diff --git a/.claude/rules/loc-exceptions.txt b/.claude/rules/loc-exceptions.txt
index 76b4aa7..c3cced7 100644
--- a/.claude/rules/loc-exceptions.txt
+++ b/.claude/rules/loc-exceptions.txt
@@ -17,6 +17,7 @@
# Pure Data Registries (keine Logik, nur Daten-Definitionen)
**/dsfa_sources_registry.py | owner=klausur | reason=Pure data registry (license + source definitions, no logic) | review=2027-01-01
+**/legal_corpus_registry.py | owner=klausur | reason=Pure data registry (Regulation dataclass + 47 regulation definitions, no logic) | review=2027-01-01
**/backlog/backlog-items.ts | owner=admin-lehrer | reason=Pure data array (506 LOC, no logic, only BacklogItem[] literals) | review=2027-01-01
**/lib/module-registry-data.ts | owner=admin-lehrer | reason=Pure data array (510 LOC, no logic, only BackendModule[] literals) | review=2027-01-01
diff --git a/backend-lehrer/classroom_engine/repository.py b/backend-lehrer/classroom_engine/repository.py
index 66a0302..777d714 100644
--- a/backend-lehrer/classroom_engine/repository.py
+++ b/backend-lehrer/classroom_engine/repository.py
@@ -1,1705 +1,33 @@
"""
-Session Repository - CRUD Operationen fuer Classroom Sessions (Feature f14).
+Session Repository - Re-export Hub.
-Abstraktion der Datenbank-Operationen fuer LessonSessions.
+Alle Repository-Klassen werden aus ihren Domain-Modulen re-exportiert,
+damit bestehende Imports unveraendert funktionieren:
+
+ from classroom_engine.repository import SessionRepository
+ from .repository import TeacherContextRepository
"""
-from datetime import datetime
-from typing import Optional, List, Dict, Any
-from sqlalchemy.orm import Session as DBSession
-
-from .db_models import (
- LessonSessionDB, PhaseHistoryDB, LessonTemplateDB, TeacherSettingsDB,
- LessonPhaseEnum, HomeworkDB, HomeworkStatusEnum, PhaseMaterialDB, MaterialTypeEnum,
- LessonReflectionDB, TeacherFeedbackDB, FeedbackTypeEnum, FeedbackStatusEnum,
- FeedbackPriorityEnum
-)
-from .context_models import (
- TeacherContextDB, SchoolyearEventDB, RecurringRoutineDB,
- MacroPhaseEnum, EventTypeEnum, EventStatusEnum,
- RoutineTypeEnum, RecurrencePatternEnum,
- FEDERAL_STATES, SCHOOL_TYPES
-)
-from .models import (
- LessonSession, LessonTemplate, LessonPhase, Homework, HomeworkStatus,
- PhaseMaterial, MaterialType, get_default_durations
-)
-from .analytics import (
- LessonReflection, SessionSummary, TeacherAnalytics, AnalyticsCalculator
+from .repository_session import SessionRepository, TeacherSettingsRepository
+from .repository_template import TemplateRepository
+from .repository_homework import HomeworkRepository, MaterialRepository
+from .repository_reflection import ReflectionRepository, AnalyticsRepository
+from .repository_feedback import TeacherFeedbackRepository
+from .repository_context import (
+ TeacherContextRepository,
+ SchoolyearEventRepository,
+ RecurringRoutineRepository,
)
-
-class SessionRepository:
- """Repository fuer LessonSession CRUD-Operationen."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- # ==================== CREATE ====================
-
- def create(self, session: LessonSession) -> LessonSessionDB:
- """
- Erstellt eine neue Session in der Datenbank.
-
- Args:
- session: LessonSession Dataclass
-
- Returns:
- LessonSessionDB Model
- """
- db_session = LessonSessionDB(
- id=session.session_id,
- teacher_id=session.teacher_id,
- class_id=session.class_id,
- subject=session.subject,
- topic=session.topic,
- current_phase=LessonPhaseEnum(session.current_phase.value),
- is_paused=session.is_paused,
- lesson_started_at=session.lesson_started_at,
- lesson_ended_at=session.lesson_ended_at,
- phase_started_at=session.phase_started_at,
- pause_started_at=session.pause_started_at,
- total_paused_seconds=session.total_paused_seconds,
- phase_durations=session.phase_durations,
- phase_history=session.phase_history,
- notes=session.notes,
- homework=session.homework,
- )
- self.db.add(db_session)
- self.db.commit()
- self.db.refresh(db_session)
- return db_session
-
- # ==================== READ ====================
-
- def get_by_id(self, session_id: str) -> Optional[LessonSessionDB]:
- """Holt eine Session nach ID."""
- return self.db.query(LessonSessionDB).filter(
- LessonSessionDB.id == session_id
- ).first()
-
- def get_active_by_teacher(self, teacher_id: str) -> List[LessonSessionDB]:
- """Holt alle aktiven Sessions eines Lehrers."""
- return self.db.query(LessonSessionDB).filter(
- LessonSessionDB.teacher_id == teacher_id,
- LessonSessionDB.current_phase != LessonPhaseEnum.ENDED
- ).all()
-
- def get_history_by_teacher(
- self,
- teacher_id: str,
- limit: int = 20,
- offset: int = 0
- ) -> List[LessonSessionDB]:
- """Holt Session-History eines Lehrers (Feature f17)."""
- return self.db.query(LessonSessionDB).filter(
- LessonSessionDB.teacher_id == teacher_id,
- LessonSessionDB.current_phase == LessonPhaseEnum.ENDED
- ).order_by(
- LessonSessionDB.lesson_ended_at.desc()
- ).offset(offset).limit(limit).all()
-
- def get_by_class(
- self,
- class_id: str,
- limit: int = 20
- ) -> List[LessonSessionDB]:
- """Holt Sessions einer Klasse."""
- return self.db.query(LessonSessionDB).filter(
- LessonSessionDB.class_id == class_id
- ).order_by(
- LessonSessionDB.created_at.desc()
- ).limit(limit).all()
-
- # ==================== UPDATE ====================
-
- def update(self, session: LessonSession) -> Optional[LessonSessionDB]:
- """
- Aktualisiert eine bestehende Session.
-
- Args:
- session: LessonSession Dataclass mit aktualisierten Werten
-
- Returns:
- Aktualisierte LessonSessionDB oder None
- """
- db_session = self.get_by_id(session.session_id)
- if not db_session:
- return None
-
- db_session.current_phase = LessonPhaseEnum(session.current_phase.value)
- db_session.is_paused = session.is_paused
- db_session.lesson_started_at = session.lesson_started_at
- db_session.lesson_ended_at = session.lesson_ended_at
- db_session.phase_started_at = session.phase_started_at
- db_session.pause_started_at = session.pause_started_at
- db_session.total_paused_seconds = session.total_paused_seconds
- db_session.phase_durations = session.phase_durations
- db_session.phase_history = session.phase_history
- db_session.notes = session.notes
- db_session.homework = session.homework
-
- self.db.commit()
- self.db.refresh(db_session)
- return db_session
-
- def update_notes(
- self,
- session_id: str,
- notes: str,
- homework: str
- ) -> Optional[LessonSessionDB]:
- """Aktualisiert nur Notizen und Hausaufgaben."""
- db_session = self.get_by_id(session_id)
- if not db_session:
- return None
-
- db_session.notes = notes
- db_session.homework = homework
-
- self.db.commit()
- self.db.refresh(db_session)
- return db_session
-
- # ==================== DELETE ====================
-
- def delete(self, session_id: str) -> bool:
- """Loescht eine Session."""
- db_session = self.get_by_id(session_id)
- if not db_session:
- return False
-
- self.db.delete(db_session)
- self.db.commit()
- return True
-
- # ==================== CONVERSION ====================
-
- def to_dataclass(self, db_session: LessonSessionDB) -> LessonSession:
- """
- Konvertiert DB-Model zu Dataclass.
-
- Args:
- db_session: LessonSessionDB Model
-
- Returns:
- LessonSession Dataclass
- """
- return LessonSession(
- session_id=db_session.id,
- teacher_id=db_session.teacher_id,
- class_id=db_session.class_id,
- subject=db_session.subject,
- topic=db_session.topic,
- current_phase=LessonPhase(db_session.current_phase.value),
- phase_started_at=db_session.phase_started_at,
- lesson_started_at=db_session.lesson_started_at,
- lesson_ended_at=db_session.lesson_ended_at,
- is_paused=db_session.is_paused,
- pause_started_at=db_session.pause_started_at,
- total_paused_seconds=db_session.total_paused_seconds or 0,
- phase_durations=db_session.phase_durations or get_default_durations(),
- phase_history=db_session.phase_history or [],
- notes=db_session.notes or "",
- homework=db_session.homework or "",
- )
-
-
-class TeacherSettingsRepository:
- """Repository fuer Lehrer-Einstellungen (Feature f16)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- def get_or_create(self, teacher_id: str) -> TeacherSettingsDB:
- """Holt oder erstellt Einstellungen fuer einen Lehrer."""
- settings = self.db.query(TeacherSettingsDB).filter(
- TeacherSettingsDB.teacher_id == teacher_id
- ).first()
-
- if not settings:
- settings = TeacherSettingsDB(
- teacher_id=teacher_id,
- default_phase_durations=get_default_durations(),
- )
- self.db.add(settings)
- self.db.commit()
- self.db.refresh(settings)
-
- return settings
-
- def update_phase_durations(
- self,
- teacher_id: str,
- durations: Dict[str, int]
- ) -> TeacherSettingsDB:
- """Aktualisiert die Standard-Phasendauern."""
- settings = self.get_or_create(teacher_id)
- settings.default_phase_durations = durations
- self.db.commit()
- self.db.refresh(settings)
- return settings
-
- def update_preferences(
- self,
- teacher_id: str,
- audio_enabled: Optional[bool] = None,
- high_contrast: Optional[bool] = None,
- show_statistics: Optional[bool] = None
- ) -> TeacherSettingsDB:
- """Aktualisiert UI-Praeferenzen."""
- settings = self.get_or_create(teacher_id)
-
- if audio_enabled is not None:
- settings.audio_enabled = audio_enabled
- if high_contrast is not None:
- settings.high_contrast = high_contrast
- if show_statistics is not None:
- settings.show_statistics = show_statistics
-
- self.db.commit()
- self.db.refresh(settings)
- return settings
-
-
-class TemplateRepository:
- """Repository fuer Stunden-Vorlagen (Feature f37)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- # ==================== CREATE ====================
-
- def create(self, template: LessonTemplate) -> LessonTemplateDB:
- """Erstellt eine neue Vorlage."""
- db_template = LessonTemplateDB(
- id=template.template_id,
- teacher_id=template.teacher_id,
- name=template.name,
- description=template.description,
- subject=template.subject,
- grade_level=template.grade_level,
- phase_durations=template.phase_durations,
- default_topic=template.default_topic,
- default_notes=template.default_notes,
- is_public=template.is_public,
- usage_count=template.usage_count,
- )
- self.db.add(db_template)
- self.db.commit()
- self.db.refresh(db_template)
- return db_template
-
- # ==================== READ ====================
-
- def get_by_id(self, template_id: str) -> Optional[LessonTemplateDB]:
- """Holt eine Vorlage nach ID."""
- return self.db.query(LessonTemplateDB).filter(
- LessonTemplateDB.id == template_id
- ).first()
-
- def get_by_teacher(
- self,
- teacher_id: str,
- include_public: bool = True
- ) -> List[LessonTemplateDB]:
- """
- Holt alle Vorlagen eines Lehrers.
-
- Args:
- teacher_id: ID des Lehrers
- include_public: Auch oeffentliche Vorlagen anderer Lehrer einbeziehen
- """
- if include_public:
- return self.db.query(LessonTemplateDB).filter(
- (LessonTemplateDB.teacher_id == teacher_id) |
- (LessonTemplateDB.is_public == True)
- ).order_by(
- LessonTemplateDB.usage_count.desc()
- ).all()
- else:
- return self.db.query(LessonTemplateDB).filter(
- LessonTemplateDB.teacher_id == teacher_id
- ).order_by(
- LessonTemplateDB.created_at.desc()
- ).all()
-
- def get_public_templates(self, limit: int = 20) -> List[LessonTemplateDB]:
- """Holt oeffentliche Vorlagen, sortiert nach Beliebtheit."""
- return self.db.query(LessonTemplateDB).filter(
- LessonTemplateDB.is_public == True
- ).order_by(
- LessonTemplateDB.usage_count.desc()
- ).limit(limit).all()
-
- def get_by_subject(
- self,
- subject: str,
- teacher_id: Optional[str] = None
- ) -> List[LessonTemplateDB]:
- """Holt Vorlagen fuer ein bestimmtes Fach."""
- query = self.db.query(LessonTemplateDB).filter(
- LessonTemplateDB.subject == subject
- )
- if teacher_id:
- query = query.filter(
- (LessonTemplateDB.teacher_id == teacher_id) |
- (LessonTemplateDB.is_public == True)
- )
- else:
- query = query.filter(LessonTemplateDB.is_public == True)
-
- return query.order_by(
- LessonTemplateDB.usage_count.desc()
- ).all()
-
- # ==================== UPDATE ====================
-
- def update(self, template: LessonTemplate) -> Optional[LessonTemplateDB]:
- """Aktualisiert eine Vorlage."""
- db_template = self.get_by_id(template.template_id)
- if not db_template:
- return None
-
- db_template.name = template.name
- db_template.description = template.description
- db_template.subject = template.subject
- db_template.grade_level = template.grade_level
- db_template.phase_durations = template.phase_durations
- db_template.default_topic = template.default_topic
- db_template.default_notes = template.default_notes
- db_template.is_public = template.is_public
-
- self.db.commit()
- self.db.refresh(db_template)
- return db_template
-
- def increment_usage(self, template_id: str) -> Optional[LessonTemplateDB]:
- """Erhoeht den Usage-Counter einer Vorlage."""
- db_template = self.get_by_id(template_id)
- if not db_template:
- return None
-
- db_template.usage_count += 1
- self.db.commit()
- self.db.refresh(db_template)
- return db_template
-
- # ==================== DELETE ====================
-
- def delete(self, template_id: str) -> bool:
- """Loescht eine Vorlage."""
- db_template = self.get_by_id(template_id)
- if not db_template:
- return False
-
- self.db.delete(db_template)
- self.db.commit()
- return True
-
- # ==================== CONVERSION ====================
-
- def to_dataclass(self, db_template: LessonTemplateDB) -> LessonTemplate:
- """Konvertiert DB-Model zu Dataclass."""
- return LessonTemplate(
- template_id=db_template.id,
- teacher_id=db_template.teacher_id,
- name=db_template.name,
- description=db_template.description or "",
- subject=db_template.subject or "",
- grade_level=db_template.grade_level or "",
- phase_durations=db_template.phase_durations or get_default_durations(),
- default_topic=db_template.default_topic or "",
- default_notes=db_template.default_notes or "",
- is_public=db_template.is_public,
- usage_count=db_template.usage_count,
- created_at=db_template.created_at,
- updated_at=db_template.updated_at,
- )
-
-
-class HomeworkRepository:
- """Repository fuer Hausaufgaben-Tracking (Feature f20)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- # ==================== CREATE ====================
-
- def create(self, homework: Homework) -> HomeworkDB:
- """Erstellt eine neue Hausaufgabe."""
- db_homework = HomeworkDB(
- id=homework.homework_id,
- teacher_id=homework.teacher_id,
- class_id=homework.class_id,
- subject=homework.subject,
- title=homework.title,
- description=homework.description,
- session_id=homework.session_id,
- due_date=homework.due_date,
- status=HomeworkStatusEnum(homework.status.value),
- )
- self.db.add(db_homework)
- self.db.commit()
- self.db.refresh(db_homework)
- return db_homework
-
- # ==================== READ ====================
-
- def get_by_id(self, homework_id: str) -> Optional[HomeworkDB]:
- """Holt eine Hausaufgabe nach ID."""
- return self.db.query(HomeworkDB).filter(
- HomeworkDB.id == homework_id
- ).first()
-
- def get_by_teacher(
- self,
- teacher_id: str,
- status: Optional[str] = None,
- limit: int = 50
- ) -> List[HomeworkDB]:
- """Holt alle Hausaufgaben eines Lehrers."""
- query = self.db.query(HomeworkDB).filter(
- HomeworkDB.teacher_id == teacher_id
- )
- if status:
- query = query.filter(HomeworkDB.status == HomeworkStatusEnum(status))
- return query.order_by(
- HomeworkDB.due_date.asc().nullslast(),
- HomeworkDB.created_at.desc()
- ).limit(limit).all()
-
- def get_by_class(
- self,
- class_id: str,
- teacher_id: str,
- include_completed: bool = False,
- limit: int = 20
- ) -> List[HomeworkDB]:
- """Holt alle Hausaufgaben einer Klasse."""
- query = self.db.query(HomeworkDB).filter(
- HomeworkDB.class_id == class_id,
- HomeworkDB.teacher_id == teacher_id
- )
- if not include_completed:
- query = query.filter(HomeworkDB.status != HomeworkStatusEnum.COMPLETED)
- return query.order_by(
- HomeworkDB.due_date.asc().nullslast(),
- HomeworkDB.created_at.desc()
- ).limit(limit).all()
-
- def get_by_session(self, session_id: str) -> List[HomeworkDB]:
- """Holt alle Hausaufgaben einer Session."""
- return self.db.query(HomeworkDB).filter(
- HomeworkDB.session_id == session_id
- ).order_by(HomeworkDB.created_at.desc()).all()
-
- def get_pending(
- self,
- teacher_id: str,
- days_ahead: int = 7
- ) -> List[HomeworkDB]:
- """Holt anstehende Hausaufgaben der naechsten X Tage."""
- from datetime import timedelta
- cutoff = datetime.utcnow() + timedelta(days=days_ahead)
- return self.db.query(HomeworkDB).filter(
- HomeworkDB.teacher_id == teacher_id,
- HomeworkDB.status.in_([HomeworkStatusEnum.ASSIGNED, HomeworkStatusEnum.IN_PROGRESS]),
- HomeworkDB.due_date <= cutoff
- ).order_by(HomeworkDB.due_date.asc()).all()
-
- # ==================== UPDATE ====================
-
- def update_status(
- self,
- homework_id: str,
- status: HomeworkStatus
- ) -> Optional[HomeworkDB]:
- """Aktualisiert den Status einer Hausaufgabe."""
- db_homework = self.get_by_id(homework_id)
- if not db_homework:
- return None
-
- db_homework.status = HomeworkStatusEnum(status.value)
- self.db.commit()
- self.db.refresh(db_homework)
- return db_homework
-
- def update(self, homework: Homework) -> Optional[HomeworkDB]:
- """Aktualisiert eine Hausaufgabe."""
- db_homework = self.get_by_id(homework.homework_id)
- if not db_homework:
- return None
-
- db_homework.title = homework.title
- db_homework.description = homework.description
- db_homework.due_date = homework.due_date
- db_homework.status = HomeworkStatusEnum(homework.status.value)
-
- self.db.commit()
- self.db.refresh(db_homework)
- return db_homework
-
- # ==================== DELETE ====================
-
- def delete(self, homework_id: str) -> bool:
- """Loescht eine Hausaufgabe."""
- db_homework = self.get_by_id(homework_id)
- if not db_homework:
- return False
-
- self.db.delete(db_homework)
- self.db.commit()
- return True
-
- # ==================== CONVERSION ====================
-
- def to_dataclass(self, db_homework: HomeworkDB) -> Homework:
- """Konvertiert DB-Model zu Dataclass."""
- return Homework(
- homework_id=db_homework.id,
- teacher_id=db_homework.teacher_id,
- class_id=db_homework.class_id,
- subject=db_homework.subject,
- title=db_homework.title,
- description=db_homework.description or "",
- session_id=db_homework.session_id,
- due_date=db_homework.due_date,
- status=HomeworkStatus(db_homework.status.value),
- created_at=db_homework.created_at,
- updated_at=db_homework.updated_at,
- )
-
-
-class MaterialRepository:
- """Repository fuer Phasen-Materialien (Feature f19)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- # ==================== CREATE ====================
-
- def create(self, material: PhaseMaterial) -> PhaseMaterialDB:
- """Erstellt ein neues Material."""
- db_material = PhaseMaterialDB(
- id=material.material_id,
- teacher_id=material.teacher_id,
- title=material.title,
- material_type=MaterialTypeEnum(material.material_type.value),
- url=material.url,
- description=material.description,
- phase=material.phase,
- subject=material.subject,
- grade_level=material.grade_level,
- tags=material.tags,
- is_public=material.is_public,
- usage_count=material.usage_count,
- session_id=material.session_id,
- )
- self.db.add(db_material)
- self.db.commit()
- self.db.refresh(db_material)
- return db_material
-
- # ==================== READ ====================
-
- def get_by_id(self, material_id: str) -> Optional[PhaseMaterialDB]:
- """Holt ein Material nach ID."""
- return self.db.query(PhaseMaterialDB).filter(
- PhaseMaterialDB.id == material_id
- ).first()
-
- def get_by_teacher(
- self,
- teacher_id: str,
- phase: Optional[str] = None,
- subject: Optional[str] = None,
- limit: int = 50
- ) -> List[PhaseMaterialDB]:
- """Holt alle Materialien eines Lehrers."""
- query = self.db.query(PhaseMaterialDB).filter(
- PhaseMaterialDB.teacher_id == teacher_id
- )
- if phase:
- query = query.filter(PhaseMaterialDB.phase == phase)
- if subject:
- query = query.filter(PhaseMaterialDB.subject == subject)
-
- return query.order_by(
- PhaseMaterialDB.usage_count.desc(),
- PhaseMaterialDB.created_at.desc()
- ).limit(limit).all()
-
- def get_by_phase(
- self,
- phase: str,
- teacher_id: str,
- include_public: bool = True
- ) -> List[PhaseMaterialDB]:
- """Holt alle Materialien fuer eine bestimmte Phase."""
- if include_public:
- return self.db.query(PhaseMaterialDB).filter(
- PhaseMaterialDB.phase == phase,
- (PhaseMaterialDB.teacher_id == teacher_id) |
- (PhaseMaterialDB.is_public == True)
- ).order_by(
- PhaseMaterialDB.usage_count.desc()
- ).all()
- else:
- return self.db.query(PhaseMaterialDB).filter(
- PhaseMaterialDB.phase == phase,
- PhaseMaterialDB.teacher_id == teacher_id
- ).order_by(
- PhaseMaterialDB.created_at.desc()
- ).all()
-
- def get_by_session(self, session_id: str) -> List[PhaseMaterialDB]:
- """Holt alle Materialien einer Session."""
- return self.db.query(PhaseMaterialDB).filter(
- PhaseMaterialDB.session_id == session_id
- ).order_by(PhaseMaterialDB.phase, PhaseMaterialDB.created_at).all()
-
- def get_public_materials(
- self,
- phase: Optional[str] = None,
- subject: Optional[str] = None,
- limit: int = 20
- ) -> List[PhaseMaterialDB]:
- """Holt oeffentliche Materialien."""
- query = self.db.query(PhaseMaterialDB).filter(
- PhaseMaterialDB.is_public == True
- )
- if phase:
- query = query.filter(PhaseMaterialDB.phase == phase)
- if subject:
- query = query.filter(PhaseMaterialDB.subject == subject)
-
- return query.order_by(
- PhaseMaterialDB.usage_count.desc()
- ).limit(limit).all()
-
- def search_by_tags(
- self,
- tags: List[str],
- teacher_id: Optional[str] = None
- ) -> List[PhaseMaterialDB]:
- """Sucht Materialien nach Tags."""
- # SQLite/PostgreSQL JSON contains
- query = self.db.query(PhaseMaterialDB)
- if teacher_id:
- query = query.filter(
- (PhaseMaterialDB.teacher_id == teacher_id) |
- (PhaseMaterialDB.is_public == True)
- )
- else:
- query = query.filter(PhaseMaterialDB.is_public == True)
-
- # Filter by tags - vereinfachte Implementierung
- results = []
- for material in query.all():
- if material.tags and any(tag in material.tags for tag in tags):
- results.append(material)
- return results[:50]
-
- # ==================== UPDATE ====================
-
- def update(self, material: PhaseMaterial) -> Optional[PhaseMaterialDB]:
- """Aktualisiert ein Material."""
- db_material = self.get_by_id(material.material_id)
- if not db_material:
- return None
-
- db_material.title = material.title
- db_material.material_type = MaterialTypeEnum(material.material_type.value)
- db_material.url = material.url
- db_material.description = material.description
- db_material.phase = material.phase
- db_material.subject = material.subject
- db_material.grade_level = material.grade_level
- db_material.tags = material.tags
- db_material.is_public = material.is_public
-
- self.db.commit()
- self.db.refresh(db_material)
- return db_material
-
- def increment_usage(self, material_id: str) -> Optional[PhaseMaterialDB]:
- """Erhoeht den Usage-Counter eines Materials."""
- db_material = self.get_by_id(material_id)
- if not db_material:
- return None
-
- db_material.usage_count += 1
- self.db.commit()
- self.db.refresh(db_material)
- return db_material
-
- def attach_to_session(
- self,
- material_id: str,
- session_id: str
- ) -> Optional[PhaseMaterialDB]:
- """Verknuepft ein Material mit einer Session."""
- db_material = self.get_by_id(material_id)
- if not db_material:
- return None
-
- db_material.session_id = session_id
- db_material.usage_count += 1
- self.db.commit()
- self.db.refresh(db_material)
- return db_material
-
- # ==================== DELETE ====================
-
- def delete(self, material_id: str) -> bool:
- """Loescht ein Material."""
- db_material = self.get_by_id(material_id)
- if not db_material:
- return False
-
- self.db.delete(db_material)
- self.db.commit()
- return True
-
- # ==================== CONVERSION ====================
-
- def to_dataclass(self, db_material: PhaseMaterialDB) -> PhaseMaterial:
- """Konvertiert DB-Model zu Dataclass."""
- return PhaseMaterial(
- material_id=db_material.id,
- teacher_id=db_material.teacher_id,
- title=db_material.title,
- material_type=MaterialType(db_material.material_type.value),
- url=db_material.url,
- description=db_material.description or "",
- phase=db_material.phase,
- subject=db_material.subject or "",
- grade_level=db_material.grade_level or "",
- tags=db_material.tags or [],
- is_public=db_material.is_public,
- usage_count=db_material.usage_count,
- session_id=db_material.session_id,
- created_at=db_material.created_at,
- updated_at=db_material.updated_at,
- )
-
-
-# ==================== REFLECTION REPOSITORY (Phase 5) ====================
-
-class ReflectionRepository:
- """Repository fuer LessonReflection CRUD-Operationen."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- # ==================== CREATE ====================
-
- def create(self, reflection: LessonReflection) -> LessonReflectionDB:
- """Erstellt eine neue Reflection."""
- db_reflection = LessonReflectionDB(
- id=reflection.reflection_id,
- session_id=reflection.session_id,
- teacher_id=reflection.teacher_id,
- notes=reflection.notes,
- overall_rating=reflection.overall_rating,
- what_worked=reflection.what_worked,
- improvements=reflection.improvements,
- notes_for_next_lesson=reflection.notes_for_next_lesson,
- )
- self.db.add(db_reflection)
- self.db.commit()
- self.db.refresh(db_reflection)
- return db_reflection
-
- # ==================== READ ====================
-
- def get_by_id(self, reflection_id: str) -> Optional[LessonReflectionDB]:
- """Holt eine Reflection nach ID."""
- return self.db.query(LessonReflectionDB).filter(
- LessonReflectionDB.id == reflection_id
- ).first()
-
- def get_by_session(self, session_id: str) -> Optional[LessonReflectionDB]:
- """Holt die Reflection einer Session."""
- return self.db.query(LessonReflectionDB).filter(
- LessonReflectionDB.session_id == session_id
- ).first()
-
- def get_by_teacher(
- self,
- teacher_id: str,
- limit: int = 20,
- offset: int = 0
- ) -> List[LessonReflectionDB]:
- """Holt alle Reflections eines Lehrers."""
- return self.db.query(LessonReflectionDB).filter(
- LessonReflectionDB.teacher_id == teacher_id
- ).order_by(
- LessonReflectionDB.created_at.desc()
- ).offset(offset).limit(limit).all()
-
- # ==================== UPDATE ====================
-
- def update(self, reflection: LessonReflection) -> Optional[LessonReflectionDB]:
- """Aktualisiert eine Reflection."""
- db_reflection = self.get_by_id(reflection.reflection_id)
- if not db_reflection:
- return None
-
- db_reflection.notes = reflection.notes
- db_reflection.overall_rating = reflection.overall_rating
- db_reflection.what_worked = reflection.what_worked
- db_reflection.improvements = reflection.improvements
- db_reflection.notes_for_next_lesson = reflection.notes_for_next_lesson
-
- self.db.commit()
- self.db.refresh(db_reflection)
- return db_reflection
-
- # ==================== DELETE ====================
-
- def delete(self, reflection_id: str) -> bool:
- """Loescht eine Reflection."""
- db_reflection = self.get_by_id(reflection_id)
- if not db_reflection:
- return False
-
- self.db.delete(db_reflection)
- self.db.commit()
- return True
-
- # ==================== CONVERSION ====================
-
- def to_dataclass(self, db_reflection: LessonReflectionDB) -> LessonReflection:
- """Konvertiert DB-Model zu Dataclass."""
- return LessonReflection(
- reflection_id=db_reflection.id,
- session_id=db_reflection.session_id,
- teacher_id=db_reflection.teacher_id,
- notes=db_reflection.notes or "",
- overall_rating=db_reflection.overall_rating,
- what_worked=db_reflection.what_worked or [],
- improvements=db_reflection.improvements or [],
- notes_for_next_lesson=db_reflection.notes_for_next_lesson or "",
- created_at=db_reflection.created_at,
- updated_at=db_reflection.updated_at,
- )
-
-
-# ==================== ANALYTICS REPOSITORY (Phase 5) ====================
-
-class AnalyticsRepository:
- """Repository fuer Analytics-Abfragen."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- def get_session_summary(self, session_id: str) -> Optional[SessionSummary]:
- """
- Berechnet die Summary einer abgeschlossenen Session.
-
- Args:
- session_id: ID der Session
-
- Returns:
- SessionSummary oder None wenn Session nicht gefunden
- """
- db_session = self.db.query(LessonSessionDB).filter(
- LessonSessionDB.id == session_id
- ).first()
-
- if not db_session:
- return None
-
- # Session-Daten zusammenstellen
- session_data = {
- "session_id": db_session.id,
- "teacher_id": db_session.teacher_id,
- "class_id": db_session.class_id,
- "subject": db_session.subject,
- "topic": db_session.topic,
- "lesson_started_at": db_session.lesson_started_at,
- "lesson_ended_at": db_session.lesson_ended_at,
- "phase_durations": db_session.phase_durations or {},
- }
-
- # Phase History aus DB oder JSON
- phase_history = db_session.phase_history or []
-
- # Summary berechnen
- return AnalyticsCalculator.calculate_session_summary(
- session_data, phase_history
- )
-
- def get_teacher_analytics(
- self,
- teacher_id: str,
- period_start: Optional[datetime] = None,
- period_end: Optional[datetime] = None
- ) -> TeacherAnalytics:
- """
- Berechnet aggregierte Statistiken fuer einen Lehrer.
-
- Args:
- teacher_id: ID des Lehrers
- period_start: Beginn des Zeitraums (default: 30 Tage zurueck)
- period_end: Ende des Zeitraums (default: jetzt)
-
- Returns:
- TeacherAnalytics mit aggregierten Statistiken
- """
- from datetime import timedelta
-
- if not period_end:
- period_end = datetime.utcnow()
- if not period_start:
- period_start = period_end - timedelta(days=30)
-
- # Sessions im Zeitraum abfragen
- sessions_query = self.db.query(LessonSessionDB).filter(
- LessonSessionDB.teacher_id == teacher_id,
- LessonSessionDB.lesson_started_at >= period_start,
- LessonSessionDB.lesson_started_at <= period_end
- ).all()
-
- # Sessions zu Dictionaries konvertieren
- sessions_data = []
- for db_session in sessions_query:
- sessions_data.append({
- "session_id": db_session.id,
- "teacher_id": db_session.teacher_id,
- "class_id": db_session.class_id,
- "subject": db_session.subject,
- "topic": db_session.topic,
- "lesson_started_at": db_session.lesson_started_at,
- "lesson_ended_at": db_session.lesson_ended_at,
- "phase_durations": db_session.phase_durations or {},
- "phase_history": db_session.phase_history or [],
- })
-
- return AnalyticsCalculator.calculate_teacher_analytics(
- sessions_data, period_start, period_end
- )
-
- def get_phase_duration_trends(
- self,
- teacher_id: str,
- phase: str,
- limit: int = 20
- ) -> List[Dict[str, Any]]:
- """
- Gibt die Dauer-Trends fuer eine bestimmte Phase zurueck.
-
- Args:
- teacher_id: ID des Lehrers
- phase: Phasen-ID (einstieg, erarbeitung, etc.)
- limit: Max Anzahl der Datenpunkte
-
- Returns:
- Liste von Datenpunkten [{date, planned, actual, difference}]
- """
- sessions = self.db.query(LessonSessionDB).filter(
- LessonSessionDB.teacher_id == teacher_id,
- LessonSessionDB.current_phase == LessonPhaseEnum.ENDED
- ).order_by(
- LessonSessionDB.lesson_ended_at.desc()
- ).limit(limit).all()
-
- trends = []
- for db_session in sessions:
- history = db_session.phase_history or []
- for entry in history:
- if entry.get("phase") == phase:
- planned = (db_session.phase_durations or {}).get(phase, 0) * 60
- actual = entry.get("duration_seconds", 0) or 0
- trends.append({
- "date": db_session.lesson_started_at.isoformat() if db_session.lesson_started_at else None,
- "session_id": db_session.id,
- "subject": db_session.subject,
- "planned_seconds": planned,
- "actual_seconds": actual,
- "difference_seconds": actual - planned,
- })
- break
-
- return list(reversed(trends)) # Chronologisch sortieren
-
- def get_overtime_analysis(
- self,
- teacher_id: str,
- limit: int = 30
- ) -> Dict[str, Any]:
- """
- Analysiert Overtime-Muster.
-
- Args:
- teacher_id: ID des Lehrers
- limit: Anzahl der zu analysierenden Sessions
-
- Returns:
- Dict mit Overtime-Statistiken pro Phase
- """
- sessions = self.db.query(LessonSessionDB).filter(
- LessonSessionDB.teacher_id == teacher_id,
- LessonSessionDB.current_phase == LessonPhaseEnum.ENDED
- ).order_by(
- LessonSessionDB.lesson_ended_at.desc()
- ).limit(limit).all()
-
- phase_overtime: Dict[str, List[int]] = {
- "einstieg": [],
- "erarbeitung": [],
- "sicherung": [],
- "transfer": [],
- "reflexion": [],
- }
-
- for db_session in sessions:
- history = db_session.phase_history or []
- phase_durations = db_session.phase_durations or {}
-
- for entry in history:
- phase = entry.get("phase", "")
- if phase in phase_overtime:
- planned = phase_durations.get(phase, 0) * 60
- actual = entry.get("duration_seconds", 0) or 0
- overtime = max(0, actual - planned)
- phase_overtime[phase].append(overtime)
-
- # Statistiken berechnen
- result = {}
- for phase, overtimes in phase_overtime.items():
- if overtimes:
- result[phase] = {
- "count": len([o for o in overtimes if o > 0]),
- "total": len(overtimes),
- "avg_overtime_seconds": sum(overtimes) / len(overtimes),
- "max_overtime_seconds": max(overtimes),
- "overtime_percentage": len([o for o in overtimes if o > 0]) / len(overtimes) * 100,
- }
- else:
- result[phase] = {
- "count": 0,
- "total": 0,
- "avg_overtime_seconds": 0,
- "max_overtime_seconds": 0,
- "overtime_percentage": 0,
- }
-
- return result
-
-
-# ==================== TEACHER FEEDBACK REPOSITORY (Phase 7) ====================
-
-
-class TeacherFeedbackRepository:
- """
- Repository fuer Lehrer-Feedback CRUD-Operationen.
-
- Ermoeglicht Lehrern, Feedback (Bugs, Feature-Requests, Verbesserungen)
- direkt aus dem Lehrer-Frontend zu senden.
- """
-
- def __init__(self, db: DBSession):
- self.db = db
-
- def create(
- self,
- teacher_id: str,
- title: str,
- description: str,
- feedback_type: str = "improvement",
- priority: str = "medium",
- teacher_name: str = "",
- teacher_email: str = "",
- context_url: str = "",
- context_phase: str = "",
- context_session_id: str = None,
- user_agent: str = "",
- related_feature: str = None,
- ) -> TeacherFeedbackDB:
- """Erstellt neues Feedback."""
- import uuid
-
- db_feedback = TeacherFeedbackDB(
- id=str(uuid.uuid4()),
- teacher_id=teacher_id,
- teacher_name=teacher_name,
- teacher_email=teacher_email,
- title=title,
- description=description,
- feedback_type=FeedbackTypeEnum(feedback_type),
- priority=FeedbackPriorityEnum(priority),
- status=FeedbackStatusEnum.NEW,
- related_feature=related_feature,
- context_url=context_url,
- context_phase=context_phase,
- context_session_id=context_session_id,
- user_agent=user_agent,
- )
-
- self.db.add(db_feedback)
- self.db.commit()
- self.db.refresh(db_feedback)
- return db_feedback
-
- def get_by_id(self, feedback_id: str) -> Optional[TeacherFeedbackDB]:
- """Holt Feedback nach ID."""
- return self.db.query(TeacherFeedbackDB).filter(
- TeacherFeedbackDB.id == feedback_id
- ).first()
-
- def get_all(
- self,
- status: str = None,
- feedback_type: str = None,
- limit: int = 100,
- offset: int = 0
- ) -> List[TeacherFeedbackDB]:
- """Holt alle Feedbacks mit optionalen Filtern."""
- query = self.db.query(TeacherFeedbackDB)
-
- if status:
- query = query.filter(TeacherFeedbackDB.status == FeedbackStatusEnum(status))
- if feedback_type:
- query = query.filter(TeacherFeedbackDB.feedback_type == FeedbackTypeEnum(feedback_type))
-
- return query.order_by(
- TeacherFeedbackDB.created_at.desc()
- ).offset(offset).limit(limit).all()
-
- def get_by_teacher(self, teacher_id: str, limit: int = 50) -> List[TeacherFeedbackDB]:
- """Holt Feedback eines bestimmten Lehrers."""
- return self.db.query(TeacherFeedbackDB).filter(
- TeacherFeedbackDB.teacher_id == teacher_id
- ).order_by(
- TeacherFeedbackDB.created_at.desc()
- ).limit(limit).all()
-
- def update_status(
- self,
- feedback_id: str,
- status: str,
- response: str = None,
- responded_by: str = None
- ) -> Optional[TeacherFeedbackDB]:
- """Aktualisiert den Status eines Feedbacks."""
- db_feedback = self.get_by_id(feedback_id)
- if not db_feedback:
- return None
-
- db_feedback.status = FeedbackStatusEnum(status)
- if response:
- db_feedback.response = response
- db_feedback.responded_at = datetime.utcnow()
- db_feedback.responded_by = responded_by
-
- self.db.commit()
- self.db.refresh(db_feedback)
- return db_feedback
-
- def delete(self, feedback_id: str) -> bool:
- """Loescht ein Feedback."""
- db_feedback = self.get_by_id(feedback_id)
- if not db_feedback:
- return False
-
- self.db.delete(db_feedback)
- self.db.commit()
- return True
-
- def get_stats(self) -> Dict[str, Any]:
- """Gibt Statistiken ueber alle Feedbacks zurueck."""
- all_feedback = self.db.query(TeacherFeedbackDB).all()
-
- stats = {
- "total": len(all_feedback),
- "by_status": {},
- "by_type": {},
- "by_priority": {},
- }
-
- for fb in all_feedback:
- # By Status
- status = fb.status.value
- stats["by_status"][status] = stats["by_status"].get(status, 0) + 1
-
- # By Type
- fb_type = fb.feedback_type.value
- stats["by_type"][fb_type] = stats["by_type"].get(fb_type, 0) + 1
-
- # By Priority
- priority = fb.priority.value
- stats["by_priority"][priority] = stats["by_priority"].get(priority, 0) + 1
-
- return stats
-
- def to_dict(self, db_feedback: TeacherFeedbackDB) -> Dict[str, Any]:
- """Konvertiert DB-Model zu Dictionary."""
- return {
- "id": db_feedback.id,
- "teacher_id": db_feedback.teacher_id,
- "teacher_name": db_feedback.teacher_name,
- "teacher_email": db_feedback.teacher_email,
- "title": db_feedback.title,
- "description": db_feedback.description,
- "feedback_type": db_feedback.feedback_type.value,
- "priority": db_feedback.priority.value,
- "status": db_feedback.status.value,
- "related_feature": db_feedback.related_feature,
- "context_url": db_feedback.context_url,
- "context_phase": db_feedback.context_phase,
- "context_session_id": db_feedback.context_session_id,
- "user_agent": db_feedback.user_agent,
- "response": db_feedback.response,
- "responded_at": db_feedback.responded_at.isoformat() if db_feedback.responded_at else None,
- "responded_by": db_feedback.responded_by,
- "created_at": db_feedback.created_at.isoformat() if db_feedback.created_at else None,
- "updated_at": db_feedback.updated_at.isoformat() if db_feedback.updated_at else None,
- }
-
-
-# ==================== Phase 8: Teacher Context Repository ====================
-
-
-class TeacherContextRepository:
- """Repository fuer Lehrer-Kontext CRUD-Operationen (Phase 8)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- # ==================== CREATE / GET-OR-CREATE ====================
-
- def get_or_create(self, teacher_id: str) -> TeacherContextDB:
- """
- Holt den Kontext eines Lehrers oder erstellt einen neuen.
-
- Args:
- teacher_id: ID des Lehrers
-
- Returns:
- TeacherContextDB Model
- """
- context = self.get_by_teacher_id(teacher_id)
- if context:
- return context
-
- # Neuen Kontext erstellen
- from uuid import uuid4
- context = TeacherContextDB(
- id=str(uuid4()),
- teacher_id=teacher_id,
- macro_phase=MacroPhaseEnum.ONBOARDING,
- )
- self.db.add(context)
- self.db.commit()
- self.db.refresh(context)
- return context
-
- # ==================== READ ====================
-
- def get_by_teacher_id(self, teacher_id: str) -> Optional[TeacherContextDB]:
- """Holt den Kontext eines Lehrers."""
- return self.db.query(TeacherContextDB).filter(
- TeacherContextDB.teacher_id == teacher_id
- ).first()
-
- # ==================== UPDATE ====================
-
- def update_context(
- self,
- teacher_id: str,
- federal_state: str = None,
- school_type: str = None,
- schoolyear: str = None,
- schoolyear_start: datetime = None,
- macro_phase: str = None,
- current_week: int = None,
- ) -> Optional[TeacherContextDB]:
- """Aktualisiert den Kontext eines Lehrers."""
- context = self.get_or_create(teacher_id)
-
- if federal_state is not None:
- context.federal_state = federal_state
- if school_type is not None:
- context.school_type = school_type
- if schoolyear is not None:
- context.schoolyear = schoolyear
- if schoolyear_start is not None:
- context.schoolyear_start = schoolyear_start
- if macro_phase is not None:
- context.macro_phase = MacroPhaseEnum(macro_phase)
- if current_week is not None:
- context.current_week = current_week
-
- self.db.commit()
- self.db.refresh(context)
- return context
-
- def complete_onboarding(self, teacher_id: str) -> TeacherContextDB:
- """Markiert Onboarding als abgeschlossen."""
- context = self.get_or_create(teacher_id)
- context.onboarding_completed = True
- context.macro_phase = MacroPhaseEnum.SCHULJAHRESSTART
- self.db.commit()
- self.db.refresh(context)
- return context
-
- def update_flags(
- self,
- teacher_id: str,
- has_classes: bool = None,
- has_schedule: bool = None,
- is_exam_period: bool = None,
- is_before_holidays: bool = None,
- ) -> TeacherContextDB:
- """Aktualisiert die Status-Flags eines Kontexts."""
- context = self.get_or_create(teacher_id)
-
- if has_classes is not None:
- context.has_classes = has_classes
- if has_schedule is not None:
- context.has_schedule = has_schedule
- if is_exam_period is not None:
- context.is_exam_period = is_exam_period
- if is_before_holidays is not None:
- context.is_before_holidays = is_before_holidays
-
- self.db.commit()
- self.db.refresh(context)
- return context
-
- def to_dict(self, context: TeacherContextDB) -> Dict[str, Any]:
- """Konvertiert DB-Model zu Dictionary."""
- return {
- "id": context.id,
- "teacher_id": context.teacher_id,
- "school": {
- "federal_state": context.federal_state,
- "federal_state_name": FEDERAL_STATES.get(context.federal_state, ""),
- "school_type": context.school_type,
- "school_type_name": SCHOOL_TYPES.get(context.school_type, ""),
- },
- "school_year": {
- "id": context.schoolyear,
- "start": context.schoolyear_start.isoformat() if context.schoolyear_start else None,
- "current_week": context.current_week,
- },
- "macro_phase": {
- "id": context.macro_phase.value,
- "label": self._get_phase_label(context.macro_phase),
- },
- "flags": {
- "onboarding_completed": context.onboarding_completed,
- "has_classes": context.has_classes,
- "has_schedule": context.has_schedule,
- "is_exam_period": context.is_exam_period,
- "is_before_holidays": context.is_before_holidays,
- },
- "created_at": context.created_at.isoformat() if context.created_at else None,
- "updated_at": context.updated_at.isoformat() if context.updated_at else None,
- }
-
- def _get_phase_label(self, phase: MacroPhaseEnum) -> str:
- """Gibt den Anzeigenamen einer Makro-Phase zurueck."""
- labels = {
- MacroPhaseEnum.ONBOARDING: "Einrichtung",
- MacroPhaseEnum.SCHULJAHRESSTART: "Schuljahresstart",
- MacroPhaseEnum.UNTERRICHTSAUFBAU: "Unterrichtsaufbau",
- MacroPhaseEnum.LEISTUNGSPHASE_1: "Leistungsphase 1",
- MacroPhaseEnum.HALBJAHRESABSCHLUSS: "Halbjahresabschluss",
- MacroPhaseEnum.LEISTUNGSPHASE_2: "Leistungsphase 2",
- MacroPhaseEnum.JAHRESABSCHLUSS: "Jahresabschluss",
- }
- return labels.get(phase, phase.value)
-
-
-# ==================== Phase 8: Schoolyear Event Repository ====================
-
-
-class SchoolyearEventRepository:
- """Repository fuer Schuljahr-Events (Phase 8)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- def create(
- self,
- teacher_id: str,
- title: str,
- start_date: datetime,
- event_type: str = "other",
- end_date: datetime = None,
- class_id: str = None,
- subject: str = None,
- description: str = "",
- needs_preparation: bool = True,
- reminder_days_before: int = 7,
- extra_data: Dict[str, Any] = None,
- ) -> SchoolyearEventDB:
- """Erstellt ein neues Schuljahr-Event."""
- from uuid import uuid4
- event = SchoolyearEventDB(
- id=str(uuid4()),
- teacher_id=teacher_id,
- title=title,
- event_type=EventTypeEnum(event_type),
- start_date=start_date,
- end_date=end_date,
- class_id=class_id,
- subject=subject,
- description=description,
- needs_preparation=needs_preparation,
- reminder_days_before=reminder_days_before,
- extra_data=extra_data or {},
- )
- self.db.add(event)
- self.db.commit()
- self.db.refresh(event)
- return event
-
- def get_by_id(self, event_id: str) -> Optional[SchoolyearEventDB]:
- """Holt ein Event nach ID."""
- return self.db.query(SchoolyearEventDB).filter(
- SchoolyearEventDB.id == event_id
- ).first()
-
- def get_by_teacher(
- self,
- teacher_id: str,
- status: str = None,
- event_type: str = None,
- limit: int = 50,
- ) -> List[SchoolyearEventDB]:
- """Holt Events eines Lehrers."""
- query = self.db.query(SchoolyearEventDB).filter(
- SchoolyearEventDB.teacher_id == teacher_id
- )
- if status:
- query = query.filter(SchoolyearEventDB.status == EventStatusEnum(status))
- if event_type:
- query = query.filter(SchoolyearEventDB.event_type == EventTypeEnum(event_type))
-
- return query.order_by(SchoolyearEventDB.start_date).limit(limit).all()
-
- def get_upcoming(
- self,
- teacher_id: str,
- days: int = 30,
- limit: int = 10,
- ) -> List[SchoolyearEventDB]:
- """Holt anstehende Events der naechsten X Tage."""
- from datetime import timedelta
- now = datetime.utcnow()
- end = now + timedelta(days=days)
-
- return self.db.query(SchoolyearEventDB).filter(
- SchoolyearEventDB.teacher_id == teacher_id,
- SchoolyearEventDB.start_date >= now,
- SchoolyearEventDB.start_date <= end,
- SchoolyearEventDB.status != EventStatusEnum.CANCELLED,
- ).order_by(SchoolyearEventDB.start_date).limit(limit).all()
-
- def update_status(
- self,
- event_id: str,
- status: str,
- preparation_done: bool = None,
- ) -> Optional[SchoolyearEventDB]:
- """Aktualisiert den Status eines Events."""
- event = self.get_by_id(event_id)
- if not event:
- return None
-
- event.status = EventStatusEnum(status)
- if preparation_done is not None:
- event.preparation_done = preparation_done
-
- self.db.commit()
- self.db.refresh(event)
- return event
-
- def delete(self, event_id: str) -> bool:
- """Loescht ein Event."""
- event = self.get_by_id(event_id)
- if not event:
- return False
- self.db.delete(event)
- self.db.commit()
- return True
-
- def to_dict(self, event: SchoolyearEventDB) -> Dict[str, Any]:
- """Konvertiert DB-Model zu Dictionary."""
- return {
- "id": event.id,
- "teacher_id": event.teacher_id,
- "event_type": event.event_type.value,
- "title": event.title,
- "description": event.description,
- "start_date": event.start_date.isoformat() if event.start_date else None,
- "end_date": event.end_date.isoformat() if event.end_date else None,
- "class_id": event.class_id,
- "subject": event.subject,
- "status": event.status.value,
- "needs_preparation": event.needs_preparation,
- "preparation_done": event.preparation_done,
- "reminder_days_before": event.reminder_days_before,
- "extra_data": event.extra_data,
- "created_at": event.created_at.isoformat() if event.created_at else None,
- }
-
-
-# ==================== Phase 8: Recurring Routine Repository ====================
-
-
-class RecurringRoutineRepository:
- """Repository fuer wiederkehrende Routinen (Phase 8)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- def create(
- self,
- teacher_id: str,
- title: str,
- routine_type: str = "other",
- recurrence_pattern: str = "weekly",
- day_of_week: int = None,
- day_of_month: int = None,
- time_of_day: str = None, # Format: "14:00"
- duration_minutes: int = 60,
- description: str = "",
- valid_from: datetime = None,
- valid_until: datetime = None,
- ) -> RecurringRoutineDB:
- """Erstellt eine neue wiederkehrende Routine."""
- from uuid import uuid4
- from datetime import time as dt_time
-
- time_obj = None
- if time_of_day:
- parts = time_of_day.split(":")
- time_obj = dt_time(int(parts[0]), int(parts[1]))
-
- routine = RecurringRoutineDB(
- id=str(uuid4()),
- teacher_id=teacher_id,
- title=title,
- routine_type=RoutineTypeEnum(routine_type),
- recurrence_pattern=RecurrencePatternEnum(recurrence_pattern),
- day_of_week=day_of_week,
- day_of_month=day_of_month,
- time_of_day=time_obj,
- duration_minutes=duration_minutes,
- description=description,
- valid_from=valid_from,
- valid_until=valid_until,
- )
- self.db.add(routine)
- self.db.commit()
- self.db.refresh(routine)
- return routine
-
- def get_by_id(self, routine_id: str) -> Optional[RecurringRoutineDB]:
- """Holt eine Routine nach ID."""
- return self.db.query(RecurringRoutineDB).filter(
- RecurringRoutineDB.id == routine_id
- ).first()
-
- def get_by_teacher(
- self,
- teacher_id: str,
- is_active: bool = True,
- routine_type: str = None,
- ) -> List[RecurringRoutineDB]:
- """Holt Routinen eines Lehrers."""
- query = self.db.query(RecurringRoutineDB).filter(
- RecurringRoutineDB.teacher_id == teacher_id
- )
- if is_active is not None:
- query = query.filter(RecurringRoutineDB.is_active == is_active)
- if routine_type:
- query = query.filter(RecurringRoutineDB.routine_type == RoutineTypeEnum(routine_type))
-
- return query.all()
-
- def get_today(self, teacher_id: str) -> List[RecurringRoutineDB]:
- """Holt Routinen die heute stattfinden."""
- today = datetime.utcnow()
- day_of_week = today.weekday() # 0 = Montag
- day_of_month = today.day
-
- routines = self.get_by_teacher(teacher_id, is_active=True)
- today_routines = []
-
- for routine in routines:
- if routine.recurrence_pattern == RecurrencePatternEnum.DAILY:
- today_routines.append(routine)
- elif routine.recurrence_pattern == RecurrencePatternEnum.WEEKLY:
- if routine.day_of_week == day_of_week:
- today_routines.append(routine)
- elif routine.recurrence_pattern == RecurrencePatternEnum.BIWEEKLY:
- # Vereinfacht: Pruefen ob Tag passt (echte Logik braucht Startdatum)
- if routine.day_of_week == day_of_week:
- today_routines.append(routine)
- elif routine.recurrence_pattern == RecurrencePatternEnum.MONTHLY:
- if routine.day_of_month == day_of_month:
- today_routines.append(routine)
-
- return today_routines
-
- def update(
- self,
- routine_id: str,
- title: str = None,
- is_active: bool = None,
- day_of_week: int = None,
- time_of_day: str = None,
- ) -> Optional[RecurringRoutineDB]:
- """Aktualisiert eine Routine."""
- routine = self.get_by_id(routine_id)
- if not routine:
- return None
-
- if title is not None:
- routine.title = title
- if is_active is not None:
- routine.is_active = is_active
- if day_of_week is not None:
- routine.day_of_week = day_of_week
- if time_of_day is not None:
- from datetime import time as dt_time
- parts = time_of_day.split(":")
- routine.time_of_day = dt_time(int(parts[0]), int(parts[1]))
-
- self.db.commit()
- self.db.refresh(routine)
- return routine
-
- def delete(self, routine_id: str) -> bool:
- """Loescht eine Routine."""
- routine = self.get_by_id(routine_id)
- if not routine:
- return False
- self.db.delete(routine)
- self.db.commit()
- return True
-
- def to_dict(self, routine: RecurringRoutineDB) -> Dict[str, Any]:
- """Konvertiert DB-Model zu Dictionary."""
- return {
- "id": routine.id,
- "teacher_id": routine.teacher_id,
- "routine_type": routine.routine_type.value,
- "title": routine.title,
- "description": routine.description,
- "recurrence_pattern": routine.recurrence_pattern.value,
- "day_of_week": routine.day_of_week,
- "day_of_month": routine.day_of_month,
- "time_of_day": routine.time_of_day.isoformat() if routine.time_of_day else None,
- "duration_minutes": routine.duration_minutes,
- "is_active": routine.is_active,
- "valid_from": routine.valid_from.isoformat() if routine.valid_from else None,
- "valid_until": routine.valid_until.isoformat() if routine.valid_until else None,
- "created_at": routine.created_at.isoformat() if routine.created_at else None,
- }
+__all__ = [
+ "SessionRepository",
+ "TeacherSettingsRepository",
+ "TemplateRepository",
+ "HomeworkRepository",
+ "MaterialRepository",
+ "ReflectionRepository",
+ "AnalyticsRepository",
+ "TeacherFeedbackRepository",
+ "TeacherContextRepository",
+ "SchoolyearEventRepository",
+ "RecurringRoutineRepository",
+]
diff --git a/backend-lehrer/classroom_engine/repository_context.py b/backend-lehrer/classroom_engine/repository_context.py
new file mode 100644
index 0000000..5bbfe42
--- /dev/null
+++ b/backend-lehrer/classroom_engine/repository_context.py
@@ -0,0 +1,453 @@
+"""
+Teacher Context, Schoolyear Event & Recurring Routine Repositories.
+
+CRUD-Operationen fuer Schuljahres-Kontext (Phase 8).
+"""
+from datetime import datetime
+from typing import Optional, List, Dict, Any
+
+from sqlalchemy.orm import Session as DBSession
+
+from .context_models import (
+ TeacherContextDB, SchoolyearEventDB, RecurringRoutineDB,
+ MacroPhaseEnum, EventTypeEnum, EventStatusEnum,
+ RoutineTypeEnum, RecurrencePatternEnum,
+ FEDERAL_STATES, SCHOOL_TYPES,
+)
+
+
+class TeacherContextRepository:
+ """Repository fuer Lehrer-Kontext CRUD-Operationen (Phase 8)."""
+
+ def __init__(self, db: DBSession):
+ self.db = db
+
+ # ==================== CREATE / GET-OR-CREATE ====================
+
+ def get_or_create(self, teacher_id: str) -> TeacherContextDB:
+ """
+ Holt den Kontext eines Lehrers oder erstellt einen neuen.
+
+ Args:
+ teacher_id: ID des Lehrers
+
+ Returns:
+ TeacherContextDB Model
+ """
+ context = self.get_by_teacher_id(teacher_id)
+ if context:
+ return context
+
+ # Neuen Kontext erstellen
+ from uuid import uuid4
+ context = TeacherContextDB(
+ id=str(uuid4()),
+ teacher_id=teacher_id,
+ macro_phase=MacroPhaseEnum.ONBOARDING,
+ )
+ self.db.add(context)
+ self.db.commit()
+ self.db.refresh(context)
+ return context
+
+ # ==================== READ ====================
+
+ def get_by_teacher_id(self, teacher_id: str) -> Optional[TeacherContextDB]:
+ """Holt den Kontext eines Lehrers."""
+ return self.db.query(TeacherContextDB).filter(
+ TeacherContextDB.teacher_id == teacher_id
+ ).first()
+
+ # ==================== UPDATE ====================
+
+ def update_context(
+ self,
+ teacher_id: str,
+ federal_state: str = None,
+ school_type: str = None,
+ schoolyear: str = None,
+ schoolyear_start: datetime = None,
+ macro_phase: str = None,
+ current_week: int = None,
+ ) -> Optional[TeacherContextDB]:
+ """Aktualisiert den Kontext eines Lehrers."""
+ context = self.get_or_create(teacher_id)
+
+ if federal_state is not None:
+ context.federal_state = federal_state
+ if school_type is not None:
+ context.school_type = school_type
+ if schoolyear is not None:
+ context.schoolyear = schoolyear
+ if schoolyear_start is not None:
+ context.schoolyear_start = schoolyear_start
+ if macro_phase is not None:
+ context.macro_phase = MacroPhaseEnum(macro_phase)
+ if current_week is not None:
+ context.current_week = current_week
+
+ self.db.commit()
+ self.db.refresh(context)
+ return context
+
+ def complete_onboarding(self, teacher_id: str) -> TeacherContextDB:
+ """Markiert Onboarding als abgeschlossen."""
+ context = self.get_or_create(teacher_id)
+ context.onboarding_completed = True
+ context.macro_phase = MacroPhaseEnum.SCHULJAHRESSTART
+ self.db.commit()
+ self.db.refresh(context)
+ return context
+
+ def update_flags(
+ self,
+ teacher_id: str,
+ has_classes: bool = None,
+ has_schedule: bool = None,
+ is_exam_period: bool = None,
+ is_before_holidays: bool = None,
+ ) -> TeacherContextDB:
+ """Aktualisiert die Status-Flags eines Kontexts."""
+ context = self.get_or_create(teacher_id)
+
+ if has_classes is not None:
+ context.has_classes = has_classes
+ if has_schedule is not None:
+ context.has_schedule = has_schedule
+ if is_exam_period is not None:
+ context.is_exam_period = is_exam_period
+ if is_before_holidays is not None:
+ context.is_before_holidays = is_before_holidays
+
+ self.db.commit()
+ self.db.refresh(context)
+ return context
+
+ def to_dict(self, context: TeacherContextDB) -> Dict[str, Any]:
+ """Konvertiert DB-Model zu Dictionary."""
+ return {
+ "id": context.id,
+ "teacher_id": context.teacher_id,
+ "school": {
+ "federal_state": context.federal_state,
+ "federal_state_name": FEDERAL_STATES.get(context.federal_state, ""),
+ "school_type": context.school_type,
+ "school_type_name": SCHOOL_TYPES.get(context.school_type, ""),
+ },
+ "school_year": {
+ "id": context.schoolyear,
+ "start": context.schoolyear_start.isoformat() if context.schoolyear_start else None,
+ "current_week": context.current_week,
+ },
+ "macro_phase": {
+ "id": context.macro_phase.value,
+ "label": self._get_phase_label(context.macro_phase),
+ },
+ "flags": {
+ "onboarding_completed": context.onboarding_completed,
+ "has_classes": context.has_classes,
+ "has_schedule": context.has_schedule,
+ "is_exam_period": context.is_exam_period,
+ "is_before_holidays": context.is_before_holidays,
+ },
+ "created_at": context.created_at.isoformat() if context.created_at else None,
+ "updated_at": context.updated_at.isoformat() if context.updated_at else None,
+ }
+
+ def _get_phase_label(self, phase: MacroPhaseEnum) -> str:
+ """Gibt den Anzeigenamen einer Makro-Phase zurueck."""
+ labels = {
+ MacroPhaseEnum.ONBOARDING: "Einrichtung",
+ MacroPhaseEnum.SCHULJAHRESSTART: "Schuljahresstart",
+ MacroPhaseEnum.UNTERRICHTSAUFBAU: "Unterrichtsaufbau",
+ MacroPhaseEnum.LEISTUNGSPHASE_1: "Leistungsphase 1",
+ MacroPhaseEnum.HALBJAHRESABSCHLUSS: "Halbjahresabschluss",
+ MacroPhaseEnum.LEISTUNGSPHASE_2: "Leistungsphase 2",
+ MacroPhaseEnum.JAHRESABSCHLUSS: "Jahresabschluss",
+ }
+ return labels.get(phase, phase.value)
+
+
+class SchoolyearEventRepository:
+ """Repository fuer Schuljahr-Events (Phase 8)."""
+
+ def __init__(self, db: DBSession):
+ self.db = db
+
+ def create(
+ self,
+ teacher_id: str,
+ title: str,
+ start_date: datetime,
+ event_type: str = "other",
+ end_date: datetime = None,
+ class_id: str = None,
+ subject: str = None,
+ description: str = "",
+ needs_preparation: bool = True,
+ reminder_days_before: int = 7,
+ extra_data: Dict[str, Any] = None,
+ ) -> SchoolyearEventDB:
+ """Erstellt ein neues Schuljahr-Event."""
+ from uuid import uuid4
+ event = SchoolyearEventDB(
+ id=str(uuid4()),
+ teacher_id=teacher_id,
+ title=title,
+ event_type=EventTypeEnum(event_type),
+ start_date=start_date,
+ end_date=end_date,
+ class_id=class_id,
+ subject=subject,
+ description=description,
+ needs_preparation=needs_preparation,
+ reminder_days_before=reminder_days_before,
+ extra_data=extra_data or {},
+ )
+ self.db.add(event)
+ self.db.commit()
+ self.db.refresh(event)
+ return event
+
+ def get_by_id(self, event_id: str) -> Optional[SchoolyearEventDB]:
+ """Holt ein Event nach ID."""
+ return self.db.query(SchoolyearEventDB).filter(
+ SchoolyearEventDB.id == event_id
+ ).first()
+
+ def get_by_teacher(
+ self,
+ teacher_id: str,
+ status: str = None,
+ event_type: str = None,
+ limit: int = 50,
+ ) -> List[SchoolyearEventDB]:
+ """Holt Events eines Lehrers."""
+ query = self.db.query(SchoolyearEventDB).filter(
+ SchoolyearEventDB.teacher_id == teacher_id
+ )
+ if status:
+ query = query.filter(SchoolyearEventDB.status == EventStatusEnum(status))
+ if event_type:
+ query = query.filter(SchoolyearEventDB.event_type == EventTypeEnum(event_type))
+
+ return query.order_by(SchoolyearEventDB.start_date).limit(limit).all()
+
+ def get_upcoming(
+ self,
+ teacher_id: str,
+ days: int = 30,
+ limit: int = 10,
+ ) -> List[SchoolyearEventDB]:
+ """Holt anstehende Events der naechsten X Tage."""
+ from datetime import timedelta
+ now = datetime.utcnow()
+ end = now + timedelta(days=days)
+
+ return self.db.query(SchoolyearEventDB).filter(
+ SchoolyearEventDB.teacher_id == teacher_id,
+ SchoolyearEventDB.start_date >= now,
+ SchoolyearEventDB.start_date <= end,
+ SchoolyearEventDB.status != EventStatusEnum.CANCELLED,
+ ).order_by(SchoolyearEventDB.start_date).limit(limit).all()
+
+ def update_status(
+ self,
+ event_id: str,
+ status: str,
+ preparation_done: bool = None,
+ ) -> Optional[SchoolyearEventDB]:
+ """Aktualisiert den Status eines Events."""
+ event = self.get_by_id(event_id)
+ if not event:
+ return None
+
+ event.status = EventStatusEnum(status)
+ if preparation_done is not None:
+ event.preparation_done = preparation_done
+
+ self.db.commit()
+ self.db.refresh(event)
+ return event
+
+ def delete(self, event_id: str) -> bool:
+ """Loescht ein Event."""
+ event = self.get_by_id(event_id)
+ if not event:
+ return False
+ self.db.delete(event)
+ self.db.commit()
+ return True
+
+ def to_dict(self, event: SchoolyearEventDB) -> Dict[str, Any]:
+ """Konvertiert DB-Model zu Dictionary."""
+ return {
+ "id": event.id,
+ "teacher_id": event.teacher_id,
+ "event_type": event.event_type.value,
+ "title": event.title,
+ "description": event.description,
+ "start_date": event.start_date.isoformat() if event.start_date else None,
+ "end_date": event.end_date.isoformat() if event.end_date else None,
+ "class_id": event.class_id,
+ "subject": event.subject,
+ "status": event.status.value,
+ "needs_preparation": event.needs_preparation,
+ "preparation_done": event.preparation_done,
+ "reminder_days_before": event.reminder_days_before,
+ "extra_data": event.extra_data,
+ "created_at": event.created_at.isoformat() if event.created_at else None,
+ }
+
+
+class RecurringRoutineRepository:
+ """Repository fuer wiederkehrende Routinen (Phase 8)."""
+
+ def __init__(self, db: DBSession):
+ self.db = db
+
+ def create(
+ self,
+ teacher_id: str,
+ title: str,
+ routine_type: str = "other",
+ recurrence_pattern: str = "weekly",
+ day_of_week: int = None,
+ day_of_month: int = None,
+ time_of_day: str = None, # Format: "14:00"
+ duration_minutes: int = 60,
+ description: str = "",
+ valid_from: datetime = None,
+ valid_until: datetime = None,
+ ) -> RecurringRoutineDB:
+ """Erstellt eine neue wiederkehrende Routine."""
+ from uuid import uuid4
+ from datetime import time as dt_time
+
+ time_obj = None
+ if time_of_day:
+ parts = time_of_day.split(":")
+ time_obj = dt_time(int(parts[0]), int(parts[1]))
+
+ routine = RecurringRoutineDB(
+ id=str(uuid4()),
+ teacher_id=teacher_id,
+ title=title,
+ routine_type=RoutineTypeEnum(routine_type),
+ recurrence_pattern=RecurrencePatternEnum(recurrence_pattern),
+ day_of_week=day_of_week,
+ day_of_month=day_of_month,
+ time_of_day=time_obj,
+ duration_minutes=duration_minutes,
+ description=description,
+ valid_from=valid_from,
+ valid_until=valid_until,
+ )
+ self.db.add(routine)
+ self.db.commit()
+ self.db.refresh(routine)
+ return routine
+
+ def get_by_id(self, routine_id: str) -> Optional[RecurringRoutineDB]:
+ """Holt eine Routine nach ID."""
+ return self.db.query(RecurringRoutineDB).filter(
+ RecurringRoutineDB.id == routine_id
+ ).first()
+
+ def get_by_teacher(
+ self,
+ teacher_id: str,
+ is_active: bool = True,
+ routine_type: str = None,
+ ) -> List[RecurringRoutineDB]:
+ """Holt Routinen eines Lehrers."""
+ query = self.db.query(RecurringRoutineDB).filter(
+ RecurringRoutineDB.teacher_id == teacher_id
+ )
+ if is_active is not None:
+ query = query.filter(RecurringRoutineDB.is_active == is_active)
+ if routine_type:
+ query = query.filter(RecurringRoutineDB.routine_type == RoutineTypeEnum(routine_type))
+
+ return query.all()
+
+ def get_today(self, teacher_id: str) -> List[RecurringRoutineDB]:
+ """Holt Routinen die heute stattfinden."""
+ today = datetime.utcnow()
+ day_of_week = today.weekday() # 0 = Montag
+ day_of_month = today.day
+
+ routines = self.get_by_teacher(teacher_id, is_active=True)
+ today_routines = []
+
+ for routine in routines:
+ if routine.recurrence_pattern == RecurrencePatternEnum.DAILY:
+ today_routines.append(routine)
+ elif routine.recurrence_pattern == RecurrencePatternEnum.WEEKLY:
+ if routine.day_of_week == day_of_week:
+ today_routines.append(routine)
+ elif routine.recurrence_pattern == RecurrencePatternEnum.BIWEEKLY:
+ # Vereinfacht: Pruefen ob Tag passt (echte Logik braucht Startdatum)
+ if routine.day_of_week == day_of_week:
+ today_routines.append(routine)
+ elif routine.recurrence_pattern == RecurrencePatternEnum.MONTHLY:
+ if routine.day_of_month == day_of_month:
+ today_routines.append(routine)
+
+ return today_routines
+
+ def update(
+ self,
+ routine_id: str,
+ title: str = None,
+ is_active: bool = None,
+ day_of_week: int = None,
+ time_of_day: str = None,
+ ) -> Optional[RecurringRoutineDB]:
+ """Aktualisiert eine Routine."""
+ routine = self.get_by_id(routine_id)
+ if not routine:
+ return None
+
+ if title is not None:
+ routine.title = title
+ if is_active is not None:
+ routine.is_active = is_active
+ if day_of_week is not None:
+ routine.day_of_week = day_of_week
+ if time_of_day is not None:
+ from datetime import time as dt_time
+ parts = time_of_day.split(":")
+ routine.time_of_day = dt_time(int(parts[0]), int(parts[1]))
+
+ self.db.commit()
+ self.db.refresh(routine)
+ return routine
+
+ def delete(self, routine_id: str) -> bool:
+ """Loescht eine Routine."""
+ routine = self.get_by_id(routine_id)
+ if not routine:
+ return False
+ self.db.delete(routine)
+ self.db.commit()
+ return True
+
+ def to_dict(self, routine: RecurringRoutineDB) -> Dict[str, Any]:
+ """Konvertiert DB-Model zu Dictionary."""
+ return {
+ "id": routine.id,
+ "teacher_id": routine.teacher_id,
+ "routine_type": routine.routine_type.value,
+ "title": routine.title,
+ "description": routine.description,
+ "recurrence_pattern": routine.recurrence_pattern.value,
+ "day_of_week": routine.day_of_week,
+ "day_of_month": routine.day_of_month,
+ "time_of_day": routine.time_of_day.isoformat() if routine.time_of_day else None,
+ "duration_minutes": routine.duration_minutes,
+ "is_active": routine.is_active,
+ "valid_from": routine.valid_from.isoformat() if routine.valid_from else None,
+ "valid_until": routine.valid_until.isoformat() if routine.valid_until else None,
+ "created_at": routine.created_at.isoformat() if routine.created_at else None,
+ }
diff --git a/backend-lehrer/classroom_engine/repository_feedback.py b/backend-lehrer/classroom_engine/repository_feedback.py
new file mode 100644
index 0000000..192b3e2
--- /dev/null
+++ b/backend-lehrer/classroom_engine/repository_feedback.py
@@ -0,0 +1,182 @@
+"""
+Teacher Feedback Repository.
+
+CRUD-Operationen fuer Lehrer-Feedback (Phase 7).
+Ermoeglicht Lehrern, Bugs, Feature-Requests und Verbesserungen zu melden.
+"""
+from datetime import datetime
+from typing import Optional, List, Dict, Any
+
+from sqlalchemy.orm import Session as DBSession
+
+from .db_models import (
+ TeacherFeedbackDB, FeedbackTypeEnum, FeedbackStatusEnum,
+ FeedbackPriorityEnum,
+)
+
+
class TeacherFeedbackRepository:
    """
    Repository for teacher feedback CRUD operations.

    Lets teachers report bugs, feature requests and improvements
    directly from the teacher frontend (phase 7).
    """

    def __init__(self, db: DBSession):
        # Active SQLAlchemy session; all operations commit through it.
        self.db = db

    def create(
        self,
        teacher_id: str,
        title: str,
        description: str,
        feedback_type: str = "improvement",
        priority: str = "medium",
        teacher_name: str = "",
        teacher_email: str = "",
        context_url: str = "",
        context_phase: str = "",
        context_session_id: Optional[str] = None,
        user_agent: str = "",
        related_feature: Optional[str] = None,
    ) -> TeacherFeedbackDB:
        """Create a new feedback entry with status NEW.

        Args:
            teacher_id: ID of the submitting teacher.
            title: Short summary of the feedback.
            description: Full feedback text.
            feedback_type: FeedbackTypeEnum value (default "improvement").
            priority: FeedbackPriorityEnum value (default "medium").
            teacher_name: Optional display name of the teacher.
            teacher_email: Optional contact e-mail.
            context_url: URL the feedback was submitted from.
            context_phase: Lesson phase active at submission time.
            context_session_id: Session the feedback refers to, if any.
            user_agent: Browser user agent captured by the frontend.
            related_feature: Feature ID the feedback refers to, if any.

        Returns:
            The persisted TeacherFeedbackDB row.

        Raises:
            ValueError: If feedback_type or priority is not a valid
                enum value.
        """
        import uuid  # local import: only needed for ID generation

        db_feedback = TeacherFeedbackDB(
            id=str(uuid.uuid4()),
            teacher_id=teacher_id,
            teacher_name=teacher_name,
            teacher_email=teacher_email,
            title=title,
            description=description,
            feedback_type=FeedbackTypeEnum(feedback_type),
            priority=FeedbackPriorityEnum(priority),
            status=FeedbackStatusEnum.NEW,
            related_feature=related_feature,
            context_url=context_url,
            context_phase=context_phase,
            context_session_id=context_session_id,
            user_agent=user_agent,
        )

        self.db.add(db_feedback)
        self.db.commit()
        self.db.refresh(db_feedback)
        return db_feedback

    def get_by_id(self, feedback_id: str) -> Optional[TeacherFeedbackDB]:
        """Return the feedback entry with the given ID, or None."""
        return self.db.query(TeacherFeedbackDB).filter(
            TeacherFeedbackDB.id == feedback_id
        ).first()

    def get_all(
        self,
        status: Optional[str] = None,
        feedback_type: Optional[str] = None,
        limit: int = 100,
        offset: int = 0
    ) -> List[TeacherFeedbackDB]:
        """Return all feedback, newest first, with optional filters.

        Raises:
            ValueError: If status or feedback_type is not a valid
                enum value.
        """
        query = self.db.query(TeacherFeedbackDB)

        if status:
            query = query.filter(TeacherFeedbackDB.status == FeedbackStatusEnum(status))
        if feedback_type:
            query = query.filter(TeacherFeedbackDB.feedback_type == FeedbackTypeEnum(feedback_type))

        return query.order_by(
            TeacherFeedbackDB.created_at.desc()
        ).offset(offset).limit(limit).all()

    def get_by_teacher(self, teacher_id: str, limit: int = 50) -> List[TeacherFeedbackDB]:
        """Return a specific teacher's feedback entries, newest first."""
        return self.db.query(TeacherFeedbackDB).filter(
            TeacherFeedbackDB.teacher_id == teacher_id
        ).order_by(
            TeacherFeedbackDB.created_at.desc()
        ).limit(limit).all()

    def update_status(
        self,
        feedback_id: str,
        status: str,
        response: Optional[str] = None,
        responded_by: Optional[str] = None
    ) -> Optional[TeacherFeedbackDB]:
        """Change the status of a feedback entry.

        When a response text is given, the response metadata (timestamp
        and responder) is recorded alongside it.

        Returns:
            The refreshed row, or None if the feedback does not exist.

        Raises:
            ValueError: If status is not a valid enum value.
        """
        db_feedback = self.get_by_id(feedback_id)
        if not db_feedback:
            return None

        db_feedback.status = FeedbackStatusEnum(status)
        if response:
            db_feedback.response = response
            db_feedback.responded_at = datetime.utcnow()
            db_feedback.responded_by = responded_by

        self.db.commit()
        self.db.refresh(db_feedback)
        return db_feedback

    def delete(self, feedback_id: str) -> bool:
        """Delete a feedback entry; True if a row was removed."""
        db_feedback = self.get_by_id(feedback_id)
        if not db_feedback:
            return False

        self.db.delete(db_feedback)
        self.db.commit()
        return True

    def get_stats(self) -> Dict[str, Any]:
        """Aggregate feedback counts by status, type and priority.

        NOTE(review): loads every row into memory; acceptable for small
        volumes, switch to a SQL GROUP BY if the table grows large.
        """
        from collections import Counter  # local import: only needed here

        all_feedback = self.db.query(TeacherFeedbackDB).all()
        return {
            "total": len(all_feedback),
            "by_status": dict(Counter(fb.status.value for fb in all_feedback)),
            "by_type": dict(Counter(fb.feedback_type.value for fb in all_feedback)),
            "by_priority": dict(Counter(fb.priority.value for fb in all_feedback)),
        }

    def to_dict(self, db_feedback: TeacherFeedbackDB) -> Dict[str, Any]:
        """Serialize a feedback row into a JSON-compatible dictionary."""
        return {
            "id": db_feedback.id,
            "teacher_id": db_feedback.teacher_id,
            "teacher_name": db_feedback.teacher_name,
            "teacher_email": db_feedback.teacher_email,
            "title": db_feedback.title,
            "description": db_feedback.description,
            "feedback_type": db_feedback.feedback_type.value,
            "priority": db_feedback.priority.value,
            "status": db_feedback.status.value,
            "related_feature": db_feedback.related_feature,
            "context_url": db_feedback.context_url,
            "context_phase": db_feedback.context_phase,
            "context_session_id": db_feedback.context_session_id,
            "user_agent": db_feedback.user_agent,
            "response": db_feedback.response,
            "responded_at": db_feedback.responded_at.isoformat() if db_feedback.responded_at else None,
            "responded_by": db_feedback.responded_by,
            "created_at": db_feedback.created_at.isoformat() if db_feedback.created_at else None,
            "updated_at": db_feedback.updated_at.isoformat() if db_feedback.updated_at else None,
        }
diff --git a/backend-lehrer/classroom_engine/repository_homework.py b/backend-lehrer/classroom_engine/repository_homework.py
new file mode 100644
index 0000000..25e07b9
--- /dev/null
+++ b/backend-lehrer/classroom_engine/repository_homework.py
@@ -0,0 +1,382 @@
+"""
+Homework & Material Repositories.
+
+CRUD-Operationen fuer Hausaufgaben (Feature f20) und Phasen-Materialien (Feature f19).
+"""
+from datetime import datetime
+from typing import Optional, List
+
+from sqlalchemy.orm import Session as DBSession
+
+from .db_models import (
+ HomeworkDB, HomeworkStatusEnum, PhaseMaterialDB, MaterialTypeEnum,
+)
+from .models import (
+ Homework, HomeworkStatus, PhaseMaterial, MaterialType,
+)
+
+
class HomeworkRepository:
    """Repository for homework tracking (feature f20)."""

    def __init__(self, db: DBSession):
        self.db = db

    # -- create -------------------------------------------------------

    def create(self, homework: Homework) -> HomeworkDB:
        """Persist a new homework entry and return the stored row."""
        row = HomeworkDB(
            id=homework.homework_id,
            teacher_id=homework.teacher_id,
            class_id=homework.class_id,
            subject=homework.subject,
            title=homework.title,
            description=homework.description,
            session_id=homework.session_id,
            due_date=homework.due_date,
            status=HomeworkStatusEnum(homework.status.value),
        )
        self.db.add(row)
        self.db.commit()
        self.db.refresh(row)
        return row

    # -- read ---------------------------------------------------------

    def get_by_id(self, homework_id: str) -> Optional[HomeworkDB]:
        """Return the homework entry with the given ID, or None."""
        return (
            self.db.query(HomeworkDB)
            .filter(HomeworkDB.id == homework_id)
            .first()
        )

    def get_by_teacher(
        self,
        teacher_id: str,
        status: Optional[str] = None,
        limit: int = 50
    ) -> List[HomeworkDB]:
        """Return a teacher's homework, optionally filtered by status.

        Ordered by due date ascending (undated entries last), then by
        creation time, newest first.
        """
        query = self.db.query(HomeworkDB).filter(
            HomeworkDB.teacher_id == teacher_id
        )
        if status:
            query = query.filter(HomeworkDB.status == HomeworkStatusEnum(status))
        ordered = query.order_by(
            HomeworkDB.due_date.asc().nullslast(),
            HomeworkDB.created_at.desc(),
        )
        return ordered.limit(limit).all()

    def get_by_class(
        self,
        class_id: str,
        teacher_id: str,
        include_completed: bool = False,
        limit: int = 20
    ) -> List[HomeworkDB]:
        """Return a class's homework, hiding completed entries by default."""
        query = self.db.query(HomeworkDB).filter(
            HomeworkDB.class_id == class_id,
            HomeworkDB.teacher_id == teacher_id,
        )
        if not include_completed:
            query = query.filter(HomeworkDB.status != HomeworkStatusEnum.COMPLETED)
        ordered = query.order_by(
            HomeworkDB.due_date.asc().nullslast(),
            HomeworkDB.created_at.desc(),
        )
        return ordered.limit(limit).all()

    def get_by_session(self, session_id: str) -> List[HomeworkDB]:
        """Return all homework attached to a session, newest first."""
        return (
            self.db.query(HomeworkDB)
            .filter(HomeworkDB.session_id == session_id)
            .order_by(HomeworkDB.created_at.desc())
            .all()
        )

    def get_pending(
        self,
        teacher_id: str,
        days_ahead: int = 7
    ) -> List[HomeworkDB]:
        """Return open homework due within the next days_ahead days."""
        from datetime import timedelta

        cutoff = datetime.utcnow() + timedelta(days=days_ahead)
        open_states = [HomeworkStatusEnum.ASSIGNED, HomeworkStatusEnum.IN_PROGRESS]
        return (
            self.db.query(HomeworkDB)
            .filter(
                HomeworkDB.teacher_id == teacher_id,
                HomeworkDB.status.in_(open_states),
                HomeworkDB.due_date <= cutoff,
            )
            .order_by(HomeworkDB.due_date.asc())
            .all()
        )

    # -- update -------------------------------------------------------

    def update_status(
        self,
        homework_id: str,
        status: HomeworkStatus
    ) -> Optional[HomeworkDB]:
        """Set the status of a homework entry; None if it does not exist."""
        row = self.get_by_id(homework_id)
        if row is None:
            return None
        row.status = HomeworkStatusEnum(status.value)
        self.db.commit()
        self.db.refresh(row)
        return row

    def update(self, homework: Homework) -> Optional[HomeworkDB]:
        """Overwrite title, description, due date and status of an entry."""
        row = self.get_by_id(homework.homework_id)
        if row is None:
            return None
        row.title = homework.title
        row.description = homework.description
        row.due_date = homework.due_date
        row.status = HomeworkStatusEnum(homework.status.value)
        self.db.commit()
        self.db.refresh(row)
        return row

    # -- delete -------------------------------------------------------

    def delete(self, homework_id: str) -> bool:
        """Delete a homework entry; True if a row was removed."""
        row = self.get_by_id(homework_id)
        if row is None:
            return False
        self.db.delete(row)
        self.db.commit()
        return True

    # -- conversion ---------------------------------------------------

    def to_dataclass(self, db_homework: HomeworkDB) -> Homework:
        """Convert a DB row into the Homework dataclass."""
        return Homework(
            homework_id=db_homework.id,
            teacher_id=db_homework.teacher_id,
            class_id=db_homework.class_id,
            subject=db_homework.subject,
            title=db_homework.title,
            description=db_homework.description or "",
            session_id=db_homework.session_id,
            due_date=db_homework.due_date,
            status=HomeworkStatus(db_homework.status.value),
            created_at=db_homework.created_at,
            updated_at=db_homework.updated_at,
        )
+
+
class MaterialRepository:
    """Repository for phase materials (feature f19)."""

    def __init__(self, db: DBSession):
        # Active SQLAlchemy session; all operations commit through it.
        self.db = db

    # ==================== CREATE ====================

    def create(self, material: PhaseMaterial) -> PhaseMaterialDB:
        """Persist a new material and return the stored row."""
        db_material = PhaseMaterialDB(
            id=material.material_id,
            teacher_id=material.teacher_id,
            title=material.title,
            material_type=MaterialTypeEnum(material.material_type.value),
            url=material.url,
            description=material.description,
            phase=material.phase,
            subject=material.subject,
            grade_level=material.grade_level,
            tags=material.tags,
            is_public=material.is_public,
            usage_count=material.usage_count,
            session_id=material.session_id,
        )
        self.db.add(db_material)
        self.db.commit()
        self.db.refresh(db_material)
        return db_material

    # ==================== READ ====================

    def get_by_id(self, material_id: str) -> Optional[PhaseMaterialDB]:
        """Return the material with the given ID, or None."""
        return self.db.query(PhaseMaterialDB).filter(
            PhaseMaterialDB.id == material_id
        ).first()

    def get_by_teacher(
        self,
        teacher_id: str,
        phase: Optional[str] = None,
        subject: Optional[str] = None,
        limit: int = 50
    ) -> List[PhaseMaterialDB]:
        """Return a teacher's materials, most-used first.

        Args:
            teacher_id: Owner of the materials.
            phase: Optional phase filter.
            subject: Optional subject filter.
            limit: Maximum number of rows returned.
        """
        query = self.db.query(PhaseMaterialDB).filter(
            PhaseMaterialDB.teacher_id == teacher_id
        )
        if phase:
            query = query.filter(PhaseMaterialDB.phase == phase)
        if subject:
            query = query.filter(PhaseMaterialDB.subject == subject)

        return query.order_by(
            PhaseMaterialDB.usage_count.desc(),
            PhaseMaterialDB.created_at.desc()
        ).limit(limit).all()

    def get_by_phase(
        self,
        phase: str,
        teacher_id: str,
        include_public: bool = True
    ) -> List[PhaseMaterialDB]:
        """Return all materials for a lesson phase.

        With include_public, other teachers' public materials are
        included and the result is ordered by usage count; otherwise
        only the teacher's own materials, newest first.
        """
        if include_public:
            return self.db.query(PhaseMaterialDB).filter(
                PhaseMaterialDB.phase == phase,
                (PhaseMaterialDB.teacher_id == teacher_id) |
                (PhaseMaterialDB.is_public == True)  # noqa: E712 (SQLAlchemy column comparison)
            ).order_by(
                PhaseMaterialDB.usage_count.desc()
            ).all()
        else:
            return self.db.query(PhaseMaterialDB).filter(
                PhaseMaterialDB.phase == phase,
                PhaseMaterialDB.teacher_id == teacher_id
            ).order_by(
                PhaseMaterialDB.created_at.desc()
            ).all()

    def get_by_session(self, session_id: str) -> List[PhaseMaterialDB]:
        """Return all materials attached to a session, grouped by phase."""
        return self.db.query(PhaseMaterialDB).filter(
            PhaseMaterialDB.session_id == session_id
        ).order_by(PhaseMaterialDB.phase, PhaseMaterialDB.created_at).all()

    def get_public_materials(
        self,
        phase: Optional[str] = None,
        subject: Optional[str] = None,
        limit: int = 20
    ) -> List[PhaseMaterialDB]:
        """Return public materials, most-used first, with optional filters."""
        query = self.db.query(PhaseMaterialDB).filter(
            PhaseMaterialDB.is_public == True  # noqa: E712 (SQLAlchemy column comparison)
        )
        if phase:
            query = query.filter(PhaseMaterialDB.phase == phase)
        if subject:
            query = query.filter(PhaseMaterialDB.subject == subject)

        return query.order_by(
            PhaseMaterialDB.usage_count.desc()
        ).limit(limit).all()

    def search_by_tags(
        self,
        tags: List[str],
        teacher_id: Optional[str] = None,
        limit: int = 50
    ) -> List[PhaseMaterialDB]:
        """Search materials matching any of the given tags.

        Visible scope: the teacher's own materials plus public ones when
        teacher_id is given, otherwise only public materials.

        Args:
            tags: Tags to match (a material matches if it carries at
                least one of them).
            teacher_id: Optional teacher whose private materials are
                included.
            limit: Maximum number of results (default 50, matching the
                previous hard-coded cap).

        NOTE(review): tag matching happens in Python over all visible
        rows; consider a JSON containment query in SQL if the table
        grows large.
        """
        query = self.db.query(PhaseMaterialDB)
        if teacher_id:
            query = query.filter(
                (PhaseMaterialDB.teacher_id == teacher_id) |
                (PhaseMaterialDB.is_public == True)  # noqa: E712 (SQLAlchemy column comparison)
            )
        else:
            query = query.filter(PhaseMaterialDB.is_public == True)  # noqa: E712

        wanted = set(tags)
        results: List[PhaseMaterialDB] = []
        for material in query.all():
            # Set intersection instead of a nested any() loop.
            if material.tags and wanted.intersection(material.tags):
                results.append(material)
                if len(results) >= limit:
                    break  # stop early once the cap is reached
        return results

    # ==================== UPDATE ====================

    def update(self, material: PhaseMaterial) -> Optional[PhaseMaterialDB]:
        """Overwrite the editable fields of a material; None if missing."""
        db_material = self.get_by_id(material.material_id)
        if not db_material:
            return None

        db_material.title = material.title
        db_material.material_type = MaterialTypeEnum(material.material_type.value)
        db_material.url = material.url
        db_material.description = material.description
        db_material.phase = material.phase
        db_material.subject = material.subject
        db_material.grade_level = material.grade_level
        db_material.tags = material.tags
        db_material.is_public = material.is_public

        self.db.commit()
        self.db.refresh(db_material)
        return db_material

    def increment_usage(self, material_id: str) -> Optional[PhaseMaterialDB]:
        """Increase a material's usage counter by one."""
        db_material = self.get_by_id(material_id)
        if not db_material:
            return None

        db_material.usage_count += 1
        self.db.commit()
        self.db.refresh(db_material)
        return db_material

    def attach_to_session(
        self,
        material_id: str,
        session_id: str
    ) -> Optional[PhaseMaterialDB]:
        """Attach a material to a session (also counts as one usage)."""
        db_material = self.get_by_id(material_id)
        if not db_material:
            return None

        db_material.session_id = session_id
        db_material.usage_count += 1
        self.db.commit()
        self.db.refresh(db_material)
        return db_material

    # ==================== DELETE ====================

    def delete(self, material_id: str) -> bool:
        """Delete a material; True if a row was removed."""
        db_material = self.get_by_id(material_id)
        if not db_material:
            return False

        self.db.delete(db_material)
        self.db.commit()
        return True

    # ==================== CONVERSION ====================

    def to_dataclass(self, db_material: PhaseMaterialDB) -> PhaseMaterial:
        """Convert a DB row into the PhaseMaterial dataclass."""
        return PhaseMaterial(
            material_id=db_material.id,
            teacher_id=db_material.teacher_id,
            title=db_material.title,
            material_type=MaterialType(db_material.material_type.value),
            url=db_material.url,
            description=db_material.description or "",
            phase=db_material.phase,
            subject=db_material.subject or "",
            grade_level=db_material.grade_level or "",
            tags=db_material.tags or [],
            is_public=db_material.is_public,
            usage_count=db_material.usage_count,
            session_id=db_material.session_id,
            created_at=db_material.created_at,
            updated_at=db_material.updated_at,
        )
diff --git a/backend-lehrer/classroom_engine/repository_reflection.py b/backend-lehrer/classroom_engine/repository_reflection.py
new file mode 100644
index 0000000..159fb5f
--- /dev/null
+++ b/backend-lehrer/classroom_engine/repository_reflection.py
@@ -0,0 +1,315 @@
+"""
+Reflection & Analytics Repositories.
+
+CRUD-Operationen fuer Lesson-Reflections und Analytics-Abfragen (Phase 5).
+"""
+from datetime import datetime
+from typing import Optional, List, Dict, Any
+
+from sqlalchemy.orm import Session as DBSession
+
+from .db_models import LessonSessionDB, LessonPhaseEnum, LessonReflectionDB
+from .analytics import (
+ LessonReflection, SessionSummary, TeacherAnalytics, AnalyticsCalculator,
+)
+
+
class ReflectionRepository:
    """Repository handling CRUD for LessonReflection records."""

    # Fields that update() copies 1:1 from the dataclass onto the row.
    _MUTABLE_FIELDS = (
        "notes",
        "overall_rating",
        "what_worked",
        "improvements",
        "notes_for_next_lesson",
    )

    def __init__(self, db: DBSession):
        self.db = db

    # -- create -------------------------------------------------------

    def create(self, reflection: LessonReflection) -> LessonReflectionDB:
        """Persist a new reflection and return the stored row."""
        row = LessonReflectionDB(
            id=reflection.reflection_id,
            session_id=reflection.session_id,
            teacher_id=reflection.teacher_id,
            notes=reflection.notes,
            overall_rating=reflection.overall_rating,
            what_worked=reflection.what_worked,
            improvements=reflection.improvements,
            notes_for_next_lesson=reflection.notes_for_next_lesson,
        )
        self.db.add(row)
        self.db.commit()
        self.db.refresh(row)
        return row

    # -- read ---------------------------------------------------------

    def get_by_id(self, reflection_id: str) -> Optional[LessonReflectionDB]:
        """Return the reflection with the given ID, or None."""
        return (
            self.db.query(LessonReflectionDB)
            .filter(LessonReflectionDB.id == reflection_id)
            .first()
        )

    def get_by_session(self, session_id: str) -> Optional[LessonReflectionDB]:
        """Return the reflection attached to a session, or None."""
        return (
            self.db.query(LessonReflectionDB)
            .filter(LessonReflectionDB.session_id == session_id)
            .first()
        )

    def get_by_teacher(
        self,
        teacher_id: str,
        limit: int = 20,
        offset: int = 0
    ) -> List[LessonReflectionDB]:
        """Return a teacher's reflections, newest first, paginated."""
        return (
            self.db.query(LessonReflectionDB)
            .filter(LessonReflectionDB.teacher_id == teacher_id)
            .order_by(LessonReflectionDB.created_at.desc())
            .offset(offset)
            .limit(limit)
            .all()
        )

    # -- update -------------------------------------------------------

    def update(self, reflection: LessonReflection) -> Optional[LessonReflectionDB]:
        """Overwrite the mutable fields of an existing reflection."""
        row = self.get_by_id(reflection.reflection_id)
        if row is None:
            return None

        for field in self._MUTABLE_FIELDS:
            setattr(row, field, getattr(reflection, field))

        self.db.commit()
        self.db.refresh(row)
        return row

    # -- delete -------------------------------------------------------

    def delete(self, reflection_id: str) -> bool:
        """Delete a reflection; True if a row was removed."""
        row = self.get_by_id(reflection_id)
        if row is None:
            return False
        self.db.delete(row)
        self.db.commit()
        return True

    # -- conversion ---------------------------------------------------

    def to_dataclass(self, db_reflection: LessonReflectionDB) -> LessonReflection:
        """Convert a DB row into the LessonReflection dataclass."""
        return LessonReflection(
            reflection_id=db_reflection.id,
            session_id=db_reflection.session_id,
            teacher_id=db_reflection.teacher_id,
            notes=db_reflection.notes or "",
            overall_rating=db_reflection.overall_rating,
            what_worked=db_reflection.what_worked or [],
            improvements=db_reflection.improvements or [],
            notes_for_next_lesson=db_reflection.notes_for_next_lesson or "",
            created_at=db_reflection.created_at,
            updated_at=db_reflection.updated_at,
        )
+
+
class AnalyticsRepository:
    """Repository for analytics queries over ended lesson sessions."""

    def __init__(self, db: DBSession):
        self.db = db

    # -- internal helpers ---------------------------------------------

    def _base_session_dict(self, row: LessonSessionDB) -> Dict[str, Any]:
        """Common session fields consumed by AnalyticsCalculator."""
        return {
            "session_id": row.id,
            "teacher_id": row.teacher_id,
            "class_id": row.class_id,
            "subject": row.subject,
            "topic": row.topic,
            "lesson_started_at": row.lesson_started_at,
            "lesson_ended_at": row.lesson_ended_at,
            "phase_durations": row.phase_durations or {},
        }

    def _ended_sessions(self, teacher_id: str, limit: int) -> List[LessonSessionDB]:
        """Most recently ended sessions of a teacher, newest first."""
        return (
            self.db.query(LessonSessionDB)
            .filter(
                LessonSessionDB.teacher_id == teacher_id,
                LessonSessionDB.current_phase == LessonPhaseEnum.ENDED,
            )
            .order_by(LessonSessionDB.lesson_ended_at.desc())
            .limit(limit)
            .all()
        )

    # -- public API ---------------------------------------------------

    def get_session_summary(self, session_id: str) -> Optional[SessionSummary]:
        """
        Compute the summary of a finished session.

        Args:
            session_id: ID of the session.

        Returns:
            SessionSummary, or None when the session does not exist.
        """
        row = (
            self.db.query(LessonSessionDB)
            .filter(LessonSessionDB.id == session_id)
            .first()
        )
        if row is None:
            return None

        return AnalyticsCalculator.calculate_session_summary(
            self._base_session_dict(row),
            row.phase_history or [],
        )

    def get_teacher_analytics(
        self,
        teacher_id: str,
        period_start: Optional[datetime] = None,
        period_end: Optional[datetime] = None
    ) -> TeacherAnalytics:
        """
        Compute aggregated statistics for a teacher.

        Args:
            teacher_id: ID of the teacher.
            period_start: Start of the period (default: 30 days back).
            period_end: End of the period (default: now).

        Returns:
            TeacherAnalytics with aggregated statistics.
        """
        from datetime import timedelta

        period_end = period_end or datetime.utcnow()
        period_start = period_start or (period_end - timedelta(days=30))

        rows = (
            self.db.query(LessonSessionDB)
            .filter(
                LessonSessionDB.teacher_id == teacher_id,
                LessonSessionDB.lesson_started_at >= period_start,
                LessonSessionDB.lesson_started_at <= period_end,
            )
            .all()
        )

        sessions_data = [
            {**self._base_session_dict(row), "phase_history": row.phase_history or []}
            for row in rows
        ]

        return AnalyticsCalculator.calculate_teacher_analytics(
            sessions_data, period_start, period_end
        )

    def get_phase_duration_trends(
        self,
        teacher_id: str,
        phase: str,
        limit: int = 20
    ) -> List[Dict[str, Any]]:
        """
        Return duration trends for one lesson phase.

        Args:
            teacher_id: ID of the teacher.
            phase: Phase ID (einstieg, erarbeitung, ...).
            limit: Maximum number of data points.

        Returns:
            Chronological list of points
            [{date, session_id, subject, planned_seconds,
              actual_seconds, difference_seconds}].
        """
        points: List[Dict[str, Any]] = []
        for row in self._ended_sessions(teacher_id, limit):
            # First history entry for the requested phase, if any.
            entry = next(
                (e for e in (row.phase_history or []) if e.get("phase") == phase),
                None,
            )
            if entry is None:
                continue
            planned = (row.phase_durations or {}).get(phase, 0) * 60
            actual = entry.get("duration_seconds", 0) or 0
            points.append({
                "date": row.lesson_started_at.isoformat() if row.lesson_started_at else None,
                "session_id": row.id,
                "subject": row.subject,
                "planned_seconds": planned,
                "actual_seconds": actual,
                "difference_seconds": actual - planned,
            })

        points.reverse()  # oldest first
        return points

    def get_overtime_analysis(
        self,
        teacher_id: str,
        limit: int = 30
    ) -> Dict[str, Any]:
        """
        Analyze overtime patterns per phase.

        Args:
            teacher_id: ID of the teacher.
            limit: Number of sessions to analyze.

        Returns:
            Dict with overtime statistics per phase.
        """
        phases = ("einstieg", "erarbeitung", "sicherung", "transfer", "reflexion")
        samples: Dict[str, List[int]] = {name: [] for name in phases}

        for row in self._ended_sessions(teacher_id, limit):
            planned_map = row.phase_durations or {}
            for entry in (row.phase_history or []):
                name = entry.get("phase", "")
                if name not in samples:
                    continue
                planned = planned_map.get(name, 0) * 60
                actual = entry.get("duration_seconds", 0) or 0
                samples[name].append(max(0, actual - planned))

        report: Dict[str, Any] = {}
        for name, values in samples.items():
            if not values:
                report[name] = {
                    "count": 0,
                    "total": 0,
                    "avg_overtime_seconds": 0,
                    "max_overtime_seconds": 0,
                    "overtime_percentage": 0,
                }
                continue
            late = sum(1 for v in values if v > 0)
            report[name] = {
                "count": late,
                "total": len(values),
                "avg_overtime_seconds": sum(values) / len(values),
                "max_overtime_seconds": max(values),
                "overtime_percentage": late / len(values) * 100,
            }

        return report
diff --git a/backend-lehrer/classroom_engine/repository_session.py b/backend-lehrer/classroom_engine/repository_session.py
new file mode 100644
index 0000000..1165d33
--- /dev/null
+++ b/backend-lehrer/classroom_engine/repository_session.py
@@ -0,0 +1,248 @@
+"""
+Session & Teacher Settings Repositories.
+
+CRUD-Operationen fuer LessonSessions und Lehrer-Einstellungen.
+"""
+from typing import Optional, List, Dict
+
+from sqlalchemy.orm import Session as DBSession
+
+from .db_models import (
+ LessonSessionDB, LessonPhaseEnum, TeacherSettingsDB,
+)
+from .models import (
+ LessonSession, LessonPhase, get_default_durations,
+)
+
+
class SessionRepository:
    """Repository handling CRUD for LessonSession records."""

    def __init__(self, db: DBSession):
        self.db = db

    # -- create -------------------------------------------------------

    def create(self, session: LessonSession) -> LessonSessionDB:
        """
        Persist a new session.

        Args:
            session: LessonSession dataclass to store.

        Returns:
            The stored LessonSessionDB row.
        """
        row = LessonSessionDB(
            id=session.session_id,
            teacher_id=session.teacher_id,
            class_id=session.class_id,
            subject=session.subject,
            topic=session.topic,
            current_phase=LessonPhaseEnum(session.current_phase.value),
            is_paused=session.is_paused,
            lesson_started_at=session.lesson_started_at,
            lesson_ended_at=session.lesson_ended_at,
            phase_started_at=session.phase_started_at,
            pause_started_at=session.pause_started_at,
            total_paused_seconds=session.total_paused_seconds,
            phase_durations=session.phase_durations,
            phase_history=session.phase_history,
            notes=session.notes,
            homework=session.homework,
        )
        self.db.add(row)
        self.db.commit()
        self.db.refresh(row)
        return row

    # -- read ---------------------------------------------------------

    def get_by_id(self, session_id: str) -> Optional[LessonSessionDB]:
        """Return the session with the given ID, or None."""
        return (
            self.db.query(LessonSessionDB)
            .filter(LessonSessionDB.id == session_id)
            .first()
        )

    def get_active_by_teacher(self, teacher_id: str) -> List[LessonSessionDB]:
        """Return every not-yet-ended session of a teacher."""
        return (
            self.db.query(LessonSessionDB)
            .filter(
                LessonSessionDB.teacher_id == teacher_id,
                LessonSessionDB.current_phase != LessonPhaseEnum.ENDED,
            )
            .all()
        )

    def get_history_by_teacher(
        self,
        teacher_id: str,
        limit: int = 20,
        offset: int = 0
    ) -> List[LessonSessionDB]:
        """Return a teacher's ended sessions, newest first (feature f17)."""
        return (
            self.db.query(LessonSessionDB)
            .filter(
                LessonSessionDB.teacher_id == teacher_id,
                LessonSessionDB.current_phase == LessonPhaseEnum.ENDED,
            )
            .order_by(LessonSessionDB.lesson_ended_at.desc())
            .offset(offset)
            .limit(limit)
            .all()
        )

    def get_by_class(
        self,
        class_id: str,
        limit: int = 20
    ) -> List[LessonSessionDB]:
        """Return the most recent sessions of a class."""
        return (
            self.db.query(LessonSessionDB)
            .filter(LessonSessionDB.class_id == class_id)
            .order_by(LessonSessionDB.created_at.desc())
            .limit(limit)
            .all()
        )

    # -- update -------------------------------------------------------

    def update(self, session: LessonSession) -> Optional[LessonSessionDB]:
        """
        Write the mutable state of a session back to the database.

        Args:
            session: Dataclass carrying the updated values.

        Returns:
            The refreshed row, or None when the session does not exist.
        """
        row = self.get_by_id(session.session_id)
        if row is None:
            return None

        row.current_phase = LessonPhaseEnum(session.current_phase.value)
        # Remaining fields map 1:1 between dataclass and DB row.
        for field in (
            "is_paused",
            "lesson_started_at",
            "lesson_ended_at",
            "phase_started_at",
            "pause_started_at",
            "total_paused_seconds",
            "phase_durations",
            "phase_history",
            "notes",
            "homework",
        ):
            setattr(row, field, getattr(session, field))

        self.db.commit()
        self.db.refresh(row)
        return row

    def update_notes(
        self,
        session_id: str,
        notes: str,
        homework: str
    ) -> Optional[LessonSessionDB]:
        """Update only the notes and homework fields of a session."""
        row = self.get_by_id(session_id)
        if row is None:
            return None

        row.notes = notes
        row.homework = homework
        self.db.commit()
        self.db.refresh(row)
        return row

    # -- delete -------------------------------------------------------

    def delete(self, session_id: str) -> bool:
        """Delete a session; True if a row was removed."""
        row = self.get_by_id(session_id)
        if row is None:
            return False
        self.db.delete(row)
        self.db.commit()
        return True

    # -- conversion ---------------------------------------------------

    def to_dataclass(self, db_session: LessonSessionDB) -> LessonSession:
        """
        Convert a DB row into the LessonSession dataclass.

        Nullable columns fall back to safe defaults (0, empty
        containers, default phase durations).
        """
        return LessonSession(
            session_id=db_session.id,
            teacher_id=db_session.teacher_id,
            class_id=db_session.class_id,
            subject=db_session.subject,
            topic=db_session.topic,
            current_phase=LessonPhase(db_session.current_phase.value),
            phase_started_at=db_session.phase_started_at,
            lesson_started_at=db_session.lesson_started_at,
            lesson_ended_at=db_session.lesson_ended_at,
            is_paused=db_session.is_paused,
            pause_started_at=db_session.pause_started_at,
            total_paused_seconds=db_session.total_paused_seconds or 0,
            phase_durations=db_session.phase_durations or get_default_durations(),
            phase_history=db_session.phase_history or [],
            notes=db_session.notes or "",
            homework=db_session.homework or "",
        )
+
+
class TeacherSettingsRepository:
    """Repository for per-teacher settings (feature f16)."""

    def __init__(self, db: DBSession):
        self.db = db

    def get_or_create(self, teacher_id: str) -> TeacherSettingsDB:
        """Fetch a teacher's settings row, creating defaults on first access."""
        existing = (
            self.db.query(TeacherSettingsDB)
            .filter(TeacherSettingsDB.teacher_id == teacher_id)
            .first()
        )
        if existing is not None:
            return existing

        # NOTE(review): not race-safe under concurrent first access --
        # two sessions could both insert; presumably acceptable here,
        # confirm against deployment model.
        created = TeacherSettingsDB(
            teacher_id=teacher_id,
            default_phase_durations=get_default_durations(),
        )
        self.db.add(created)
        self.db.commit()
        self.db.refresh(created)
        return created

    def update_phase_durations(
        self,
        teacher_id: str,
        durations: Dict[str, int]
    ) -> TeacherSettingsDB:
        """Replace the teacher's default phase durations."""
        settings = self.get_or_create(teacher_id)
        settings.default_phase_durations = durations
        self.db.commit()
        self.db.refresh(settings)
        return settings

    def update_preferences(
        self,
        teacher_id: str,
        audio_enabled: Optional[bool] = None,
        high_contrast: Optional[bool] = None,
        show_statistics: Optional[bool] = None
    ) -> TeacherSettingsDB:
        """Update UI preferences; only non-None flags are changed."""
        settings = self.get_or_create(teacher_id)

        updates = {
            "audio_enabled": audio_enabled,
            "high_contrast": high_contrast,
            "show_statistics": show_statistics,
        }
        for attr, value in updates.items():
            if value is not None:
                setattr(settings, attr, value)

        self.db.commit()
        self.db.refresh(settings)
        return settings
diff --git a/backend-lehrer/classroom_engine/repository_template.py b/backend-lehrer/classroom_engine/repository_template.py
new file mode 100644
index 0000000..e97c16d
--- /dev/null
+++ b/backend-lehrer/classroom_engine/repository_template.py
@@ -0,0 +1,167 @@
+"""
+Template Repository.
+
+CRUD-Operationen fuer Stunden-Vorlagen (Feature f37).
+"""
+from typing import Optional, List
+
+from sqlalchemy.orm import Session as DBSession
+
+from .db_models import LessonTemplateDB
+from .models import LessonTemplate, get_default_durations
+
+
+class TemplateRepository:
+ """Repository fuer Stunden-Vorlagen (Feature f37)."""
+
+ def __init__(self, db: DBSession):
+ self.db = db
+
+ # ==================== CREATE ====================
+
+ def create(self, template: LessonTemplate) -> LessonTemplateDB:
+ """Erstellt eine neue Vorlage."""
+ db_template = LessonTemplateDB(
+ id=template.template_id,
+ teacher_id=template.teacher_id,
+ name=template.name,
+ description=template.description,
+ subject=template.subject,
+ grade_level=template.grade_level,
+ phase_durations=template.phase_durations,
+ default_topic=template.default_topic,
+ default_notes=template.default_notes,
+ is_public=template.is_public,
+ usage_count=template.usage_count,
+ )
+ self.db.add(db_template)
+ self.db.commit()
+ self.db.refresh(db_template)
+ return db_template
+
+ # ==================== READ ====================
+
+ def get_by_id(self, template_id: str) -> Optional[LessonTemplateDB]:
+ """Holt eine Vorlage nach ID."""
+ return self.db.query(LessonTemplateDB).filter(
+ LessonTemplateDB.id == template_id
+ ).first()
+
+ def get_by_teacher(
+ self,
+ teacher_id: str,
+ include_public: bool = True
+ ) -> List[LessonTemplateDB]:
+ """
+ Holt alle Vorlagen eines Lehrers.
+
+ Args:
+ teacher_id: ID des Lehrers
+ include_public: Auch oeffentliche Vorlagen anderer Lehrer einbeziehen
+ """
+ if include_public:
+ return self.db.query(LessonTemplateDB).filter(
+ (LessonTemplateDB.teacher_id == teacher_id) |
+ (LessonTemplateDB.is_public == True)
+ ).order_by(
+ LessonTemplateDB.usage_count.desc()
+ ).all()
+ else:
+ return self.db.query(LessonTemplateDB).filter(
+ LessonTemplateDB.teacher_id == teacher_id
+ ).order_by(
+ LessonTemplateDB.created_at.desc()
+ ).all()
+
+ def get_public_templates(self, limit: int = 20) -> List[LessonTemplateDB]:
+ """Holt oeffentliche Vorlagen, sortiert nach Beliebtheit."""
+ return self.db.query(LessonTemplateDB).filter(
+ LessonTemplateDB.is_public == True
+ ).order_by(
+ LessonTemplateDB.usage_count.desc()
+ ).limit(limit).all()
+
+ def get_by_subject(
+ self,
+ subject: str,
+ teacher_id: Optional[str] = None
+ ) -> List[LessonTemplateDB]:
+ """Holt Vorlagen fuer ein bestimmtes Fach."""
+ query = self.db.query(LessonTemplateDB).filter(
+ LessonTemplateDB.subject == subject
+ )
+ if teacher_id:
+ query = query.filter(
+ (LessonTemplateDB.teacher_id == teacher_id) |
+ (LessonTemplateDB.is_public == True)
+ )
+ else:
+ query = query.filter(LessonTemplateDB.is_public == True)
+
+ return query.order_by(
+ LessonTemplateDB.usage_count.desc()
+ ).all()
+
+ # ==================== UPDATE ====================
+
+ def update(self, template: LessonTemplate) -> Optional[LessonTemplateDB]:
+ """Aktualisiert eine Vorlage."""
+ db_template = self.get_by_id(template.template_id)
+ if not db_template:
+ return None
+
+ db_template.name = template.name
+ db_template.description = template.description
+ db_template.subject = template.subject
+ db_template.grade_level = template.grade_level
+ db_template.phase_durations = template.phase_durations
+ db_template.default_topic = template.default_topic
+ db_template.default_notes = template.default_notes
+ db_template.is_public = template.is_public
+
+ self.db.commit()
+ self.db.refresh(db_template)
+ return db_template
+
+ def increment_usage(self, template_id: str) -> Optional[LessonTemplateDB]:
+ """Erhoeht den Usage-Counter einer Vorlage."""
+ db_template = self.get_by_id(template_id)
+ if not db_template:
+ return None
+
+ db_template.usage_count += 1
+ self.db.commit()
+ self.db.refresh(db_template)
+ return db_template
+
+ # ==================== DELETE ====================
+
+ def delete(self, template_id: str) -> bool:
+ """Loescht eine Vorlage."""
+ db_template = self.get_by_id(template_id)
+ if not db_template:
+ return False
+
+ self.db.delete(db_template)
+ self.db.commit()
+ return True
+
+ # ==================== CONVERSION ====================
+
+ def to_dataclass(self, db_template: LessonTemplateDB) -> LessonTemplate:
+ """Konvertiert DB-Model zu Dataclass."""
+ return LessonTemplate(
+ template_id=db_template.id,
+ teacher_id=db_template.teacher_id,
+ name=db_template.name,
+ description=db_template.description or "",
+ subject=db_template.subject or "",
+ grade_level=db_template.grade_level or "",
+ phase_durations=db_template.phase_durations or get_default_durations(),
+ default_topic=db_template.default_topic or "",
+ default_notes=db_template.default_notes or "",
+ is_public=db_template.is_public,
+ usage_count=db_template.usage_count,
+ created_at=db_template.created_at,
+ updated_at=db_template.updated_at,
+ )
diff --git a/klausur-service/backend/cv_cell_grid.py b/klausur-service/backend/cv_cell_grid.py
index 56fd472..466565e 100644
--- a/klausur-service/backend/cv_cell_grid.py
+++ b/klausur-service/backend/cv_cell_grid.py
@@ -1,1675 +1,60 @@
"""
Cell-grid construction (v2 + legacy), vocab conversion, and word-grid OCR.
+Re-export hub — all public and private names remain importable from here
+for backward compatibility. The actual implementations live in:
+
+ cv_cell_grid_helpers.py — shared helpers (_heal_row_gaps, _is_artifact_row, ...)
+ cv_cell_grid_build.py — v2 hybrid grid (build_cell_grid_v2, _ocr_cell_crop)
+ cv_cell_grid_legacy.py — deprecated v1 grid (build_cell_grid, _ocr_single_cell)
+ cv_cell_grid_streaming.py — streaming variants (build_cell_grid_v2_streaming, ...)
+ cv_cell_grid_merge.py — row-merging logic (_merge_wrapped_rows, ...)
+ cv_cell_grid_vocab.py — vocab extraction (_cells_to_vocab_entries, build_word_grid)
+
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
-import logging
-import re
-import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Dict, Generator, List, Optional, Tuple
-
-import numpy as np
-
-from cv_vocab_types import PageRegion, RowGeometry
-from cv_ocr_engines import (
- RAPIDOCR_AVAILABLE,
- _RE_ALPHA,
- _assign_row_words_to_columns,
- _attach_example_sentences,
- _clean_cell_text,
- _clean_cell_text_lite,
- _fix_phonetic_brackets,
- _split_comma_entries,
- _words_to_reading_order_text,
- _words_to_spaced_text,
- ocr_region_lighton,
- ocr_region_rapid,
- ocr_region_trocr,
+# --- Helpers ---
+from cv_cell_grid_helpers import ( # noqa: F401
+ _MIN_WORD_CONF,
+ _compute_cell_padding,
+ _ensure_minimum_crop_size,
+ _heal_row_gaps,
+ _is_artifact_row,
+ _select_psm_for_column,
)
-logger = logging.getLogger(__name__)
-
-try:
- import cv2
-except ImportError:
- cv2 = None # type: ignore[assignment]
-
-try:
- from PIL import Image
-except ImportError:
- Image = None # type: ignore[assignment,misc]
-
-# Minimum OCR word confidence to keep (used across multiple functions)
-_MIN_WORD_CONF = 30
-
-# ---------------------------------------------------------------------------
-
-def _ocr_cell_crop(
- row_idx: int,
- col_idx: int,
- row: RowGeometry,
- col: PageRegion,
- ocr_img: np.ndarray,
- img_bgr: Optional[np.ndarray],
- img_w: int,
- img_h: int,
- engine_name: str,
- lang: str,
- lang_map: Dict[str, str],
-) -> Dict[str, Any]:
- """OCR a single cell by cropping the exact column×row intersection.
-
- No padding beyond cell boundaries → no neighbour bleeding.
- """
- # Display bbox: exact column × row intersection
- disp_x = col.x
- disp_y = row.y
- disp_w = col.width
- disp_h = row.height
-
- # Crop boundaries: add small internal padding (3px each side) to avoid
- # clipping characters near column/row edges (e.g. parentheses, descenders).
- # Stays within image bounds but may extend slightly beyond strict cell.
- # 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
- _PAD = 3
- cx = max(0, disp_x - _PAD)
- cy = max(0, disp_y - _PAD)
- cx2 = min(img_w, disp_x + disp_w + _PAD)
- cy2 = min(img_h, disp_y + disp_h + _PAD)
- cw = cx2 - cx
- ch = cy2 - cy
-
- empty_cell = {
- 'cell_id': f"R{row_idx:02d}_C{col_idx}",
- 'row_index': row_idx,
- 'col_index': col_idx,
- 'col_type': col.type,
- 'text': '',
- 'confidence': 0.0,
- 'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
- 'bbox_pct': {
- 'x': round(disp_x / img_w * 100, 2) if img_w else 0,
- 'y': round(disp_y / img_h * 100, 2) if img_h else 0,
- 'w': round(disp_w / img_w * 100, 2) if img_w else 0,
- 'h': round(disp_h / img_h * 100, 2) if img_h else 0,
- },
- 'ocr_engine': 'cell_crop_v2',
- 'is_bold': False,
- }
-
- if cw <= 0 or ch <= 0:
- logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
- return empty_cell
-
- # --- Pixel-density check: skip truly empty cells ---
- if ocr_img is not None:
- crop = ocr_img[cy:cy + ch, cx:cx + cw]
- if crop.size > 0:
- dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
- if dark_ratio < 0.005:
- logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
- row_idx, col_idx, dark_ratio, cw, ch)
- return empty_cell
-
- # --- Prepare crop for OCR ---
- cell_lang = lang_map.get(col.type, lang)
- psm = _select_psm_for_column(col.type, col.width, row.height)
- text = ''
- avg_conf = 0.0
- used_engine = 'cell_crop_v2'
-
- if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
- cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
- words = ocr_region_trocr(img_bgr, cell_region,
- handwritten=(engine_name == "trocr-handwritten"))
- elif engine_name == "lighton" and img_bgr is not None:
- cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
- words = ocr_region_lighton(img_bgr, cell_region)
- elif engine_name == "rapid" and img_bgr is not None:
- # Upscale small BGR crops for RapidOCR.
- # Cell crops typically have height 35-55px but width >300px.
- # _ensure_minimum_crop_size only scales when EITHER dim < min_dim,
- # using uniform scale → a 365×54 crop becomes ~1014×150 (scale ~2.78).
- # For very short heights (< 80px), force 3× upscale for better OCR
- # of small characters like periods, ellipsis, and phonetic symbols.
- bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
- if bgr_crop.size == 0:
- words = []
- else:
- crop_h, crop_w = bgr_crop.shape[:2]
- if crop_h < 80:
- # Force 3× upscale for short rows — small chars need more pixels
- scale = 3.0
- bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
- interpolation=cv2.INTER_CUBIC)
- else:
- bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
- up_h, up_w = bgr_up.shape[:2]
- scale_x = up_w / max(crop_w, 1)
- scale_y = up_h / max(crop_h, 1)
- was_scaled = (up_w != crop_w or up_h != crop_h)
- logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
- row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
- tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
- words = ocr_region_rapid(bgr_up, tmp_region)
- # Remap positions back to original image coords
- if words and was_scaled:
- for w in words:
- w['left'] = int(w['left'] / scale_x) + cx
- w['top'] = int(w['top'] / scale_y) + cy
- w['width'] = int(w['width'] / scale_x)
- w['height'] = int(w['height'] / scale_y)
- elif words:
- for w in words:
- w['left'] += cx
- w['top'] += cy
- else:
- # Tesseract: upscale tiny crops for better recognition
- if ocr_img is not None:
- crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
- upscaled = _ensure_minimum_crop_size(crop_slice)
- up_h, up_w = upscaled.shape[:2]
- tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
- words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm)
- # Remap word positions back to original image coordinates
- if words and (up_w != cw or up_h != ch):
- sx = cw / max(up_w, 1)
- sy = ch / max(up_h, 1)
- for w in words:
- w['left'] = int(w['left'] * sx) + cx
- w['top'] = int(w['top'] * sy) + cy
- w['width'] = int(w['width'] * sx)
- w['height'] = int(w['height'] * sy)
- elif words:
- for w in words:
- w['left'] += cx
- w['top'] += cy
- else:
- words = []
-
- # Filter low-confidence words
- if words:
- words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
-
- if words:
- y_tol = max(15, ch)
- text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
- avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
- logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
- row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
- else:
- logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
- row_idx, col_idx, cw, ch, psm, engine_name)
-
- # --- PSM 7 fallback for still-empty Tesseract cells ---
- if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
- crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
- upscaled = _ensure_minimum_crop_size(crop_slice)
- up_h, up_w = upscaled.shape[:2]
- tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
- psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7)
- if psm7_words:
- psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
- if psm7_words:
- p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
- if p7_text.strip():
- text = p7_text
- avg_conf = round(
- sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
- )
- used_engine = 'cell_crop_v2_psm7'
- # Remap PSM7 word positions back to original image coords
- if up_w != cw or up_h != ch:
- sx = cw / max(up_w, 1)
- sy = ch / max(up_h, 1)
- for w in psm7_words:
- w['left'] = int(w['left'] * sx) + cx
- w['top'] = int(w['top'] * sy) + cy
- w['width'] = int(w['width'] * sx)
- w['height'] = int(w['height'] * sy)
- else:
- for w in psm7_words:
- w['left'] += cx
- w['top'] += cy
- words = psm7_words
-
- # --- Noise filter ---
- if text.strip():
- pre_filter = text
- text = _clean_cell_text_lite(text)
- if not text:
- logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
- row_idx, col_idx, pre_filter)
- avg_conf = 0.0
-
- result = dict(empty_cell)
- result['text'] = text
- result['confidence'] = avg_conf
- result['ocr_engine'] = used_engine
-
- # Store individual word bounding boxes (absolute image coordinates)
- # for pixel-accurate overlay positioning in the frontend.
- if words and text.strip():
- result['word_boxes'] = [
- {
- 'text': w.get('text', ''),
- 'left': w['left'],
- 'top': w['top'],
- 'width': w['width'],
- 'height': w['height'],
- 'conf': w.get('conf', 0),
- }
- for w in words
- if w.get('text', '').strip()
- ]
-
- return result
-
-
-# Threshold: columns narrower than this (% of image width) use single-cell
-# crop OCR instead of full-page word assignment.
-#
-# Broad columns (>= threshold): Full-page Tesseract word assignment.
-# Better for multi-word content (sentences, IPA brackets, punctuation).
-# Examples: EN vocabulary, DE translation, example sentences.
-#
-# Narrow columns (< threshold): Isolated cell-crop OCR.
-# Prevents neighbour bleeding from adjacent broad columns.
-# Examples: page_ref, marker, numbering columns.
-#
-# 15% was empirically validated across vocab table scans with 3-5 columns.
-# Typical broad columns: 20-40% width. Typical narrow columns: 3-12% width.
-# The 15% boundary cleanly separates the two groups.
-_NARROW_COL_THRESHOLD_PCT = 15.0
-
-
-def build_cell_grid_v2(
- ocr_img: np.ndarray,
- column_regions: List[PageRegion],
- row_geometries: List[RowGeometry],
- img_w: int,
- img_h: int,
- lang: str = "eng+deu",
- ocr_engine: str = "auto",
- img_bgr: Optional[np.ndarray] = None,
- skip_heal_gaps: bool = False,
-) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
- """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.
-
- Drop-in replacement for build_cell_grid() — same signature & return type.
-
- Strategy:
- - Broad columns (>15% image width): Use pre-assigned full-page Tesseract
- words (from row.words). Handles IPA brackets, punctuation, sentence
- continuity correctly.
- - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
- neighbour bleeding from adjacent broad columns.
- """
- engine_name = "tesseract"
- if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
- engine_name = ocr_engine
- elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
- engine_name = "rapid"
-
- logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")
-
- # Filter to content rows only
- content_rows = [r for r in row_geometries if r.row_type == 'content']
- if not content_rows:
- logger.warning("build_cell_grid_v2: no content rows found")
- return [], []
-
- # Filter phantom rows (word_count=0) and artifact rows
- before = len(content_rows)
- content_rows = [r for r in content_rows if r.word_count > 0]
- skipped = before - len(content_rows)
- if skipped > 0:
- logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
- if not content_rows:
- logger.warning("build_cell_grid_v2: no content rows with words found")
- return [], []
-
- before_art = len(content_rows)
- content_rows = [r for r in content_rows if not _is_artifact_row(r)]
- artifact_skipped = before_art - len(content_rows)
- if artifact_skipped > 0:
- logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
- if not content_rows:
- logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
- return [], []
-
- # Filter columns
- _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
- 'margin_bottom', 'margin_left', 'margin_right'}
- relevant_cols = [c for c in column_regions if c.type not in _skip_types]
- if not relevant_cols:
- logger.warning("build_cell_grid_v2: no usable columns found")
- return [], []
-
- # Heal row gaps — use header/footer boundaries
- content_rows.sort(key=lambda r: r.y)
- header_rows = [r for r in row_geometries if r.row_type == 'header']
- footer_rows = [r for r in row_geometries if r.row_type == 'footer']
- if header_rows:
- top_bound = max(r.y + r.height for r in header_rows)
- else:
- top_bound = content_rows[0].y
- if footer_rows:
- bottom_bound = min(r.y for r in footer_rows)
- else:
- bottom_bound = content_rows[-1].y + content_rows[-1].height
-
- # skip_heal_gaps: When True, keep cell positions at their exact row geometry
- # positions without expanding to fill gaps from removed rows. Useful for
- # overlay rendering where pixel-precise positioning matters more than
- # full-coverage OCR crops.
- if not skip_heal_gaps:
- _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)
-
- relevant_cols.sort(key=lambda c: c.x)
-
- columns_meta = [
- {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
- for ci, c in enumerate(relevant_cols)
- ]
-
- lang_map = {
- 'column_en': 'eng',
- 'column_de': 'deu',
- 'column_example': 'eng+deu',
- }
-
- # --- Classify columns as broad vs narrow ---
- narrow_col_indices = set()
- for ci, col in enumerate(relevant_cols):
- col_pct = (col.width / img_w * 100) if img_w > 0 else 0
- if col_pct < _NARROW_COL_THRESHOLD_PCT:
- narrow_col_indices.add(ci)
-
- broad_col_count = len(relevant_cols) - len(narrow_col_indices)
- logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
- f"{len(narrow_col_indices)} narrow columns (cell-crop)")
-
- # --- Phase 1: Broad columns via full-page word assignment ---
- cells: List[Dict[str, Any]] = []
-
- for row_idx, row in enumerate(content_rows):
- # Assign full-page words to columns for this row
- col_words = _assign_row_words_to_columns(row, relevant_cols)
-
- for col_idx, col in enumerate(relevant_cols):
- if col_idx not in narrow_col_indices:
- # BROAD column: use pre-assigned full-page words
- words = col_words.get(col_idx, [])
- # Filter low-confidence words
- words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
-
- # Single full-width column (box sub-session): preserve spacing
- is_single_full_column = (
- len(relevant_cols) == 1
- and img_w > 0
- and relevant_cols[0].width / img_w > 0.9
- )
-
- if words:
- y_tol = max(15, row.height)
- if is_single_full_column:
- text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
- logger.info(f"R{row_idx:02d}: {len(words)} words, "
- f"text={text!r:.100}")
- else:
- text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
- avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
- else:
- text = ''
- avg_conf = 0.0
- if is_single_full_column:
- logger.info(f"R{row_idx:02d}: 0 words (row has "
- f"{row.word_count} total, y={row.y}..{row.y+row.height})")
-
- # Apply noise filter — but NOT for single-column sub-sessions:
- # 1. _clean_cell_text strips trailing non-alpha tokens (e.g. €0.50,
- # £1, €2.50) which are valid content in box layouts.
- # 2. _clean_cell_text joins tokens with single space, destroying
- # the proportional spacing from _words_to_spaced_text.
- if not is_single_full_column:
- text = _clean_cell_text(text)
-
- cell = {
- 'cell_id': f"R{row_idx:02d}_C{col_idx}",
- 'row_index': row_idx,
- 'col_index': col_idx,
- 'col_type': col.type,
- 'text': text,
- 'confidence': avg_conf,
- 'bbox_px': {
- 'x': col.x, 'y': row.y,
- 'w': col.width, 'h': row.height,
- },
- 'bbox_pct': {
- 'x': round(col.x / img_w * 100, 2) if img_w else 0,
- 'y': round(row.y / img_h * 100, 2) if img_h else 0,
- 'w': round(col.width / img_w * 100, 2) if img_w else 0,
- 'h': round(row.height / img_h * 100, 2) if img_h else 0,
- },
- 'ocr_engine': 'word_lookup',
- 'is_bold': False,
- }
- # Store word bounding boxes for pixel-accurate overlay
- if words and text.strip():
- cell['word_boxes'] = [
- {
- 'text': w.get('text', ''),
- 'left': w['left'],
- 'top': w['top'],
- 'width': w['width'],
- 'height': w['height'],
- 'conf': w.get('conf', 0),
- }
- for w in words
- if w.get('text', '').strip()
- ]
- cells.append(cell)
-
- # --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
- narrow_tasks = []
- for row_idx, row in enumerate(content_rows):
- for col_idx, col in enumerate(relevant_cols):
- if col_idx in narrow_col_indices:
- narrow_tasks.append((row_idx, col_idx, row, col))
-
- if narrow_tasks:
- max_workers = 4 if engine_name == "tesseract" else 2
- with ThreadPoolExecutor(max_workers=max_workers) as pool:
- futures = {
- pool.submit(
- _ocr_cell_crop,
- ri, ci, row, col,
- ocr_img, img_bgr, img_w, img_h,
- engine_name, lang, lang_map,
- ): (ri, ci)
- for ri, ci, row, col in narrow_tasks
- }
- for future in as_completed(futures):
- try:
- cell = future.result()
- cells.append(cell)
- except Exception as e:
- ri, ci = futures[future]
- logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")
-
- # Sort cells by (row_index, col_index)
- cells.sort(key=lambda c: (c['row_index'], c['col_index']))
-
- # Remove all-empty rows
- rows_with_text: set = set()
- for cell in cells:
- if cell['text'].strip():
- rows_with_text.add(cell['row_index'])
- before_filter = len(cells)
- cells = [c for c in cells if c['row_index'] in rows_with_text]
- empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
- if empty_rows_removed > 0:
- logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")
-
- # Bold detection disabled: cell-level stroke-width analysis cannot
- # distinguish bold from non-bold when cells contain mixed formatting
- # (e.g. "cookie ['kuki]" — bold word + non-bold phonetics).
- # TODO: word-level bold detection would require per-word bounding boxes.
-
- logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
- f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
- f"engine={engine_name} (hybrid)")
-
- return cells, columns_meta
-
-
-def build_cell_grid_v2_streaming(
- ocr_img: np.ndarray,
- column_regions: List[PageRegion],
- row_geometries: List[RowGeometry],
- img_w: int,
- img_h: int,
- lang: str = "eng+deu",
- ocr_engine: str = "auto",
- img_bgr: Optional[np.ndarray] = None,
-) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
- """Streaming variant of build_cell_grid_v2 — yields each cell as OCR'd.
-
- Yields:
- (cell_dict, columns_meta, total_cells)
- """
- # Resolve engine — default to Tesseract for cell-first OCR.
- # Tesseract excels at isolated text crops (binarized, upscaled).
- # RapidOCR is optimized for full-page scene-text and produces artifacts
- # on small cell crops (extra chars, missing punctuation, garbled IPA).
- use_rapid = False
- if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
- engine_name = ocr_engine
- elif ocr_engine == "auto":
- engine_name = "tesseract"
- elif ocr_engine == "rapid":
- if not RAPIDOCR_AVAILABLE:
- logger.warning("RapidOCR requested but not available, falling back to Tesseract")
- else:
- use_rapid = True
- engine_name = "rapid" if use_rapid else "tesseract"
- else:
- engine_name = "tesseract"
-
- content_rows = [r for r in row_geometries if r.row_type == 'content']
- if not content_rows:
- return
-
- content_rows = [r for r in content_rows if r.word_count > 0]
- if not content_rows:
- return
-
- _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
- 'margin_bottom', 'margin_left', 'margin_right'}
- relevant_cols = [c for c in column_regions if c.type not in _skip_types]
- if not relevant_cols:
- return
-
- content_rows = [r for r in content_rows if not _is_artifact_row(r)]
- if not content_rows:
- return
-
- # Use header/footer boundaries for heal_row_gaps (same as build_cell_grid_v2)
- content_rows.sort(key=lambda r: r.y)
- header_rows = [r for r in row_geometries if r.row_type == 'header']
- footer_rows = [r for r in row_geometries if r.row_type == 'footer']
- if header_rows:
- top_bound = max(r.y + r.height for r in header_rows)
- else:
- top_bound = content_rows[0].y
- if footer_rows:
- bottom_bound = min(r.y for r in footer_rows)
- else:
- bottom_bound = content_rows[-1].y + content_rows[-1].height
-
- _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)
-
- relevant_cols.sort(key=lambda c: c.x)
-
- columns_meta = [
- {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
- for ci, c in enumerate(relevant_cols)
- ]
-
- lang_map = {
- 'column_en': 'eng',
- 'column_de': 'deu',
- 'column_example': 'eng+deu',
- }
-
- total_cells = len(content_rows) * len(relevant_cols)
-
- for row_idx, row in enumerate(content_rows):
- for col_idx, col in enumerate(relevant_cols):
- cell = _ocr_cell_crop(
- row_idx, col_idx, row, col,
- ocr_img, img_bgr, img_w, img_h,
- engine_name, lang, lang_map,
- )
- yield cell, columns_meta, total_cells
-
-
-# ---------------------------------------------------------------------------
-# Narrow-column OCR helpers (Proposal B) — DEPRECATED (kept for legacy build_cell_grid)
-# ---------------------------------------------------------------------------
-
-def _compute_cell_padding(col_width: int, img_w: int) -> int:
- """Adaptive padding for OCR crops based on column width.
-
- Narrow columns (page_ref, marker) need more surrounding context so
- Tesseract can segment characters correctly. Wide columns keep the
- minimal 4 px padding to avoid pulling in neighbours.
- """
- col_pct = col_width / img_w * 100 if img_w > 0 else 100
- if col_pct < 5:
- return max(20, col_width // 2)
- if col_pct < 10:
- return max(12, col_width // 4)
- if col_pct < 15:
- return 8
- return 4
-
-
-def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
- max_scale: int = 3) -> np.ndarray:
- """Upscale tiny crops so Tesseract gets enough pixel data.
-
- If either dimension is below *min_dim*, the crop is bicubic-upscaled
- so the smallest dimension reaches *min_dim* (capped at *max_scale* ×).
- """
- h, w = crop.shape[:2]
- if h >= min_dim and w >= min_dim:
- return crop
- scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
- if scale <= 1.0:
- return crop
- new_w = int(w * scale)
- new_h = int(h * scale)
- return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
-
-
-def _select_psm_for_column(col_type: str, col_width: int,
- row_height: int) -> int:
- """Choose the best Tesseract PSM for a given column geometry.
-
- - page_ref columns are almost always single short tokens → PSM 8
- - Very narrow or short cells → PSM 7 (single text line)
- - Everything else → PSM 6 (uniform block)
- """
- if col_type in ('page_ref', 'marker'):
- return 8 # single word
- if col_width < 100 or row_height < 30:
- return 7 # single line
- return 6 # uniform block
-
-
-def _ocr_single_cell(
- row_idx: int,
- col_idx: int,
- row: RowGeometry,
- col: PageRegion,
- ocr_img: np.ndarray,
- img_bgr: Optional[np.ndarray],
- img_w: int,
- img_h: int,
- use_rapid: bool,
- engine_name: str,
- lang: str,
- lang_map: Dict[str, str],
- preassigned_words: Optional[List[Dict]] = None,
-) -> Dict[str, Any]:
- """Populate a single cell (column x row intersection) via word lookup."""
- # Display bbox: exact column × row intersection (no padding)
- disp_x = col.x
- disp_y = row.y
- disp_w = col.width
- disp_h = row.height
-
- # OCR crop: adaptive padding — narrow columns get more context
- pad = _compute_cell_padding(col.width, img_w)
- cell_x = max(0, col.x - pad)
- cell_y = max(0, row.y - pad)
- cell_w = min(col.width + 2 * pad, img_w - cell_x)
- cell_h = min(row.height + 2 * pad, img_h - cell_y)
- is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False
-
- if disp_w <= 0 or disp_h <= 0:
- return {
- 'cell_id': f"R{row_idx:02d}_C{col_idx}",
- 'row_index': row_idx,
- 'col_index': col_idx,
- 'col_type': col.type,
- 'text': '',
- 'confidence': 0.0,
- 'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
- 'bbox_pct': {
- 'x': round(col.x / img_w * 100, 2),
- 'y': round(row.y / img_h * 100, 2),
- 'w': round(col.width / img_w * 100, 2),
- 'h': round(row.height / img_h * 100, 2),
- },
- 'ocr_engine': 'word_lookup',
- }
-
- # --- PRIMARY: Word-lookup from full-page Tesseract ---
- words = preassigned_words if preassigned_words is not None else []
- used_engine = 'word_lookup'
-
- # Filter low-confidence words (OCR noise from images/artifacts).
- # Tesseract gives low confidence to misread image edges, borders,
- # and other non-text elements.
- if words:
- words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
-
- if words:
- # Use row height as Y-tolerance so all words within a single row
- # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
- # across two lines due to slight vertical offset).
- y_tol = max(15, row.height)
- text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
- avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
- else:
- text = ''
- avg_conf = 0.0
-
- # --- FALLBACK: Cell-OCR for empty cells ---
- # Full-page Tesseract can miss small or isolated words (e.g. "Ei").
- # Re-run OCR on the cell crop to catch what word-lookup missed.
- # To avoid wasting time on truly empty cells, check pixel density first:
- # only run Tesseract if the cell crop contains enough dark pixels to
- # plausibly contain text.
- _run_fallback = False
- if not text.strip() and cell_w > 0 and cell_h > 0:
- if ocr_img is not None:
- crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
- if crop.size > 0:
- # Threshold: pixels darker than 180 (on 0-255 grayscale).
- # Use 0.5% to catch even small text like "Ei" (2 chars)
- # in an otherwise empty cell.
- dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
- _run_fallback = dark_ratio > 0.005
- if _run_fallback:
- # For narrow columns, upscale the crop before OCR
- if is_narrow and ocr_img is not None:
- _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
- _upscaled = _ensure_minimum_crop_size(_crop_slice)
- if _upscaled is not _crop_slice:
- # Build a temporary full-size image with the upscaled crop
- # placed at origin so ocr_region can crop it cleanly.
- _up_h, _up_w = _upscaled.shape[:2]
- _tmp_region = PageRegion(
- type=col.type, x=0, y=0, width=_up_w, height=_up_h,
- )
- _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
- cell_lang = lang_map.get(col.type, lang)
- fallback_words = ocr_region(_upscaled, _tmp_region,
- lang=cell_lang, psm=_cell_psm)
- # Remap word positions back to original image coordinates
- _sx = cell_w / max(_up_w, 1)
- _sy = cell_h / max(_up_h, 1)
- for _fw in (fallback_words or []):
- _fw['left'] = int(_fw['left'] * _sx) + cell_x
- _fw['top'] = int(_fw['top'] * _sy) + cell_y
- _fw['width'] = int(_fw['width'] * _sx)
- _fw['height'] = int(_fw['height'] * _sy)
- else:
- # No upscaling needed, use adaptive PSM
- cell_region = PageRegion(
- type=col.type, x=cell_x, y=cell_y,
- width=cell_w, height=cell_h,
- )
- _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
- cell_lang = lang_map.get(col.type, lang)
- fallback_words = ocr_region(ocr_img, cell_region,
- lang=cell_lang, psm=_cell_psm)
- else:
- cell_region = PageRegion(
- type=col.type,
- x=cell_x, y=cell_y,
- width=cell_w, height=cell_h,
- )
- if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
- fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
- elif engine_name == "lighton" and img_bgr is not None:
- fallback_words = ocr_region_lighton(img_bgr, cell_region)
- elif use_rapid and img_bgr is not None:
- fallback_words = ocr_region_rapid(img_bgr, cell_region)
- else:
- _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
- cell_lang = lang_map.get(col.type, lang)
- fallback_words = ocr_region(ocr_img, cell_region,
- lang=cell_lang, psm=_cell_psm)
-
- if fallback_words:
- # Apply same confidence filter to fallback words
- fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
- if fallback_words:
- fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
- fb_y_tol = max(10, int(fb_avg_h * 0.5))
- fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
- if fb_text.strip():
- text = fb_text
- avg_conf = round(
- sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
- )
- used_engine = 'cell_ocr_fallback'
-
- # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
- if not text.strip() and _run_fallback and not use_rapid:
- _fb_region = PageRegion(
- type=col.type, x=cell_x, y=cell_y,
- width=cell_w, height=cell_h,
- )
- cell_lang = lang_map.get(col.type, lang)
- psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
- if psm7_words:
- psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
- if psm7_words:
- p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
- if p7_text.strip():
- text = p7_text
- avg_conf = round(
- sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
- )
- used_engine = 'cell_ocr_psm7'
-
- # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
- # If a narrow cell is still empty, OCR the entire row strip with
- # RapidOCR (which handles small text better) and assign words by
- # X-position overlap with this column.
- if not text.strip() and is_narrow and img_bgr is not None:
- row_region = PageRegion(
- type='_row_strip', x=0, y=row.y,
- width=img_w, height=row.height,
- )
- strip_words = ocr_region_rapid(img_bgr, row_region)
- if strip_words:
- # Filter to words overlapping this column's X-range
- col_left = col.x
- col_right = col.x + col.width
- col_words = []
- for sw in strip_words:
- sw_left = sw.get('left', 0)
- sw_right = sw_left + sw.get('width', 0)
- overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
- if overlap > sw.get('width', 1) * 0.3:
- col_words.append(sw)
- if col_words:
- col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
- if col_words:
- rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
- if rs_text.strip():
- text = rs_text
- avg_conf = round(
- sum(w['conf'] for w in col_words) / len(col_words), 1
- )
- used_engine = 'row_strip_rapid'
-
- # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
- if text.strip():
- text = _clean_cell_text(text)
- if not text:
- avg_conf = 0.0
-
- return {
- 'cell_id': f"R{row_idx:02d}_C{col_idx}",
- 'row_index': row_idx,
- 'col_index': col_idx,
- 'col_type': col.type,
- 'text': text,
- 'confidence': avg_conf,
- 'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
- 'bbox_pct': {
- 'x': round(disp_x / img_w * 100, 2),
- 'y': round(disp_y / img_h * 100, 2),
- 'w': round(disp_w / img_w * 100, 2),
- 'h': round(disp_h / img_h * 100, 2),
- },
- 'ocr_engine': used_engine,
- }
-
-
-def _is_artifact_row(row: RowGeometry) -> bool:
- """Return True if this row contains only scan artifacts, not real text.
-
- Artifact rows (scanner shadows, noise) typically produce only single-character
- detections. A real content row always has at least one token with 2+ characters.
- """
- if row.word_count == 0:
- return True
- texts = [w.get('text', '').strip() for w in row.words]
- return all(len(t) <= 1 for t in texts)
-
-
-def _heal_row_gaps(
- rows: List[RowGeometry],
- top_bound: int,
- bottom_bound: int,
-) -> None:
- """Expand row y/height to fill vertical gaps caused by removed adjacent rows.
-
- After filtering out empty or artifact rows, remaining content rows may have
- gaps between them where the removed rows used to be. This function mutates
- each row to extend upward/downward to the midpoint of such gaps so that
- OCR crops cover the full available content area.
-
- The first row always extends to top_bound; the last row to bottom_bound.
- """
- if not rows:
- return
- rows.sort(key=lambda r: r.y)
- n = len(rows)
- orig = [(r.y, r.y + r.height) for r in rows] # snapshot before mutation
-
- for i, row in enumerate(rows):
- # New top: midpoint between previous row's bottom and this row's top
- if i == 0:
- new_top = top_bound
- else:
- prev_bot = orig[i - 1][1]
- my_top = orig[i][0]
- gap = my_top - prev_bot
- new_top = prev_bot + gap // 2 if gap > 1 else my_top
-
- # New bottom: midpoint between this row's bottom and next row's top
- if i == n - 1:
- new_bottom = bottom_bound
- else:
- my_bot = orig[i][1]
- next_top = orig[i + 1][0]
- gap = next_top - my_bot
- new_bottom = my_bot + gap // 2 if gap > 1 else my_bot
-
- row.y = new_top
- row.height = max(5, new_bottom - new_top)
-
- logger.debug(
- f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
- f"(bounds: top={top_bound}, bottom={bottom_bound})"
- )
-
-
-def build_cell_grid(
- ocr_img: np.ndarray,
- column_regions: List[PageRegion],
- row_geometries: List[RowGeometry],
- img_w: int,
- img_h: int,
- lang: str = "eng+deu",
- ocr_engine: str = "auto",
- img_bgr: Optional[np.ndarray] = None,
-) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
- """Generic Cell-Grid: Columns × Rows → cells with OCR text.
-
- This is the layout-agnostic foundation. Every column (except column_ignore)
- is intersected with every content row to produce numbered cells.
-
- Args:
- ocr_img: Binarized full-page image (for Tesseract).
- column_regions: Classified columns from Step 3 (PageRegion list).
- row_geometries: Rows from Step 4 (RowGeometry list).
- img_w: Image width in pixels.
- img_h: Image height in pixels.
- lang: Default Tesseract language.
- ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed', 'trocr-handwritten', or 'lighton'.
- img_bgr: BGR color image (required for RapidOCR / TrOCR / LightOnOCR).
-
- Returns:
- (cells, columns_meta) where cells is a list of cell dicts and
- columns_meta describes the columns used.
- """
- # Resolve engine choice
- use_rapid = False
- if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
- engine_name = ocr_engine
- elif ocr_engine == "auto":
- use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
- engine_name = "rapid" if use_rapid else "tesseract"
- elif ocr_engine == "rapid":
- if not RAPIDOCR_AVAILABLE:
- logger.warning("RapidOCR requested but not available, falling back to Tesseract")
- else:
- use_rapid = True
- engine_name = "rapid" if use_rapid else "tesseract"
- else:
- engine_name = "tesseract"
-
- logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")
-
- # Filter to content rows only (skip header/footer)
- content_rows = [r for r in row_geometries if r.row_type == 'content']
- if not content_rows:
- logger.warning("build_cell_grid: no content rows found")
- return [], []
-
- # Filter phantom rows: rows with no Tesseract words assigned are
- # inter-line whitespace gaps that would produce garbage OCR.
- before = len(content_rows)
- content_rows = [r for r in content_rows if r.word_count > 0]
- skipped = before - len(content_rows)
- if skipped > 0:
- logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
- if not content_rows:
- logger.warning("build_cell_grid: no content rows with words found")
- return [], []
-
- # Use columns only — skip ignore, header, footer, page_ref
- _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
- relevant_cols = [c for c in column_regions if c.type not in _skip_types]
- if not relevant_cols:
- logger.warning("build_cell_grid: no usable columns found")
- return [], []
-
- # Filter artifact rows: rows whose detected words are all single characters
- # are caused by scanner shadows or noise, not real text.
- before_art = len(content_rows)
- content_rows = [r for r in content_rows if not _is_artifact_row(r)]
- artifact_skipped = before_art - len(content_rows)
- if artifact_skipped > 0:
- logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
- if not content_rows:
- logger.warning("build_cell_grid: no content rows after artifact filtering")
- return [], []
-
- # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
- # to fill the space so OCR crops are not artificially narrow.
- _heal_row_gaps(
- content_rows,
- top_bound=min(c.y for c in relevant_cols),
- bottom_bound=max(c.y + c.height for c in relevant_cols),
- )
-
- # Sort columns left-to-right
- relevant_cols.sort(key=lambda c: c.x)
-
- # Build columns_meta
- columns_meta = [
- {
- 'index': col_idx,
- 'type': col.type,
- 'x': col.x,
- 'width': col.width,
- }
- for col_idx, col in enumerate(relevant_cols)
- ]
-
- # Choose OCR language per column type (Tesseract only)
- lang_map = {
- 'column_en': 'eng',
- 'column_de': 'deu',
- 'column_example': 'eng+deu',
- }
-
- cells: List[Dict[str, Any]] = []
-
- for row_idx, row in enumerate(content_rows):
- # Pre-assign each word to exactly one column (nearest center)
- col_words = _assign_row_words_to_columns(row, relevant_cols)
- for col_idx, col in enumerate(relevant_cols):
- cell = _ocr_single_cell(
- row_idx, col_idx, row, col,
- ocr_img, img_bgr, img_w, img_h,
- use_rapid, engine_name, lang, lang_map,
- preassigned_words=col_words[col_idx],
- )
- cells.append(cell)
-
- # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
- # Collect cells that are still empty but have visible pixels.
- # Instead of calling Tesseract once per cell (expensive), crop an entire
- # column strip and run OCR once, then assign words to cells by Y position.
- empty_by_col: Dict[int, List[int]] = {} # col_idx → [cell list indices]
- for ci, cell in enumerate(cells):
- if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
- bpx = cell['bbox_px']
- x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
- if w > 0 and h > 0 and ocr_img is not None:
- crop = ocr_img[y:y + h, x:x + w]
- if crop.size > 0:
- dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
- if dark_ratio > 0.005:
- empty_by_col.setdefault(cell['col_index'], []).append(ci)
-
- for col_idx, cell_indices in empty_by_col.items():
- if len(cell_indices) < 3:
- continue # Not worth batching for < 3 cells
-
- # Find the column strip bounding box (union of all empty cell bboxes)
- min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
- max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
- col_x = cells[cell_indices[0]]['bbox_px']['x']
- col_w = cells[cell_indices[0]]['bbox_px']['w']
-
- strip_region = PageRegion(
- type=relevant_cols[col_idx].type,
- x=col_x, y=min_y,
- width=col_w, height=max_y_h - min_y,
- )
- strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)
-
- if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
- strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
- elif engine_name == "lighton" and img_bgr is not None:
- strip_words = ocr_region_lighton(img_bgr, strip_region)
- elif use_rapid and img_bgr is not None:
- strip_words = ocr_region_rapid(img_bgr, strip_region)
- else:
- strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)
-
- if not strip_words:
- continue
-
- strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
- if not strip_words:
- continue
-
- # Assign words to cells by Y overlap
- for ci in cell_indices:
- cell_y = cells[ci]['bbox_px']['y']
- cell_h = cells[ci]['bbox_px']['h']
- cell_mid_y = cell_y + cell_h / 2
-
- matched_words = [
- w for w in strip_words
- if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
- ]
- if matched_words:
- matched_words.sort(key=lambda w: w['left'])
- batch_text = ' '.join(w['text'] for w in matched_words)
- batch_text = _clean_cell_text(batch_text)
- if batch_text.strip():
- cells[ci]['text'] = batch_text
- cells[ci]['confidence'] = round(
- sum(w['conf'] for w in matched_words) / len(matched_words), 1
- )
- cells[ci]['ocr_engine'] = 'batch_column_ocr'
-
- batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
- if batch_filled > 0:
- logger.info(
- f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
- f"empty cells in column {col_idx}"
- )
-
- # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
- # that had stray Tesseract artifacts giving word_count > 0).
- rows_with_text: set = set()
- for cell in cells:
- if cell['text'].strip():
- rows_with_text.add(cell['row_index'])
- before_filter = len(cells)
- cells = [c for c in cells if c['row_index'] in rows_with_text]
- empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
- if empty_rows_removed > 0:
- logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")
-
- logger.info(f"build_cell_grid: {len(cells)} cells from "
- f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
- f"engine={engine_name}")
-
- return cells, columns_meta
-
-
-def build_cell_grid_streaming(
- ocr_img: np.ndarray,
- column_regions: List[PageRegion],
- row_geometries: List[RowGeometry],
- img_w: int,
- img_h: int,
- lang: str = "eng+deu",
- ocr_engine: str = "auto",
- img_bgr: Optional[np.ndarray] = None,
-) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
- """Like build_cell_grid(), but yields each cell as it is OCR'd.
-
- Yields:
- (cell_dict, columns_meta, total_cells) for each cell.
- """
- # Resolve engine choice (same as build_cell_grid)
- use_rapid = False
- if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
- engine_name = ocr_engine
- elif ocr_engine == "auto":
- use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
- engine_name = "rapid" if use_rapid else "tesseract"
- elif ocr_engine == "rapid":
- if not RAPIDOCR_AVAILABLE:
- logger.warning("RapidOCR requested but not available, falling back to Tesseract")
- else:
- use_rapid = True
- engine_name = "rapid" if use_rapid else "tesseract"
- else:
- engine_name = "tesseract"
-
- content_rows = [r for r in row_geometries if r.row_type == 'content']
- if not content_rows:
- return
-
- # Filter phantom rows: rows with no Tesseract words assigned are
- # inter-line whitespace gaps that would produce garbage OCR.
- before = len(content_rows)
- content_rows = [r for r in content_rows if r.word_count > 0]
- skipped = before - len(content_rows)
- if skipped > 0:
- logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
- if not content_rows:
- return
-
- _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
- relevant_cols = [c for c in column_regions if c.type not in _skip_types]
- if not relevant_cols:
- return
-
- # Filter artifact rows + heal gaps (same logic as build_cell_grid)
- before_art = len(content_rows)
- content_rows = [r for r in content_rows if not _is_artifact_row(r)]
- artifact_skipped = before_art - len(content_rows)
- if artifact_skipped > 0:
- logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
- if not content_rows:
- return
- _heal_row_gaps(
- content_rows,
- top_bound=min(c.y for c in relevant_cols),
- bottom_bound=max(c.y + c.height for c in relevant_cols),
- )
-
- relevant_cols.sort(key=lambda c: c.x)
-
- columns_meta = [
- {
- 'index': col_idx,
- 'type': col.type,
- 'x': col.x,
- 'width': col.width,
- }
- for col_idx, col in enumerate(relevant_cols)
- ]
-
- lang_map = {
- 'column_en': 'eng',
- 'column_de': 'deu',
- 'column_example': 'eng+deu',
- }
-
- total_cells = len(content_rows) * len(relevant_cols)
-
- for row_idx, row in enumerate(content_rows):
- # Pre-assign each word to exactly one column (nearest center)
- col_words = _assign_row_words_to_columns(row, relevant_cols)
- for col_idx, col in enumerate(relevant_cols):
- cell = _ocr_single_cell(
- row_idx, col_idx, row, col,
- ocr_img, img_bgr, img_w, img_h,
- use_rapid, engine_name, lang, lang_map,
- preassigned_words=col_words[col_idx],
- )
- yield cell, columns_meta, total_cells
-
-
-def _cells_to_vocab_entries(
- cells: List[Dict[str, Any]],
- columns_meta: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
- """Map generic cells to vocab entries with english/german/example fields.
-
- Groups cells by row_index, maps col_type → field name, and produces
- one entry per row (only rows with at least one non-empty field).
- """
- # Determine image dimensions from first cell (for row-level bbox)
- col_type_to_field = {
- 'column_en': 'english',
- 'column_de': 'german',
- 'column_example': 'example',
- 'page_ref': 'source_page',
- 'column_marker': 'marker',
- 'column_text': 'text', # generic single-column (box sub-sessions)
- }
- bbox_key_map = {
- 'column_en': 'bbox_en',
- 'column_de': 'bbox_de',
- 'column_example': 'bbox_ex',
- 'page_ref': 'bbox_ref',
- 'column_marker': 'bbox_marker',
- 'column_text': 'bbox_text',
- }
-
- # Group cells by row_index
- rows: Dict[int, List[Dict]] = {}
- for cell in cells:
- ri = cell['row_index']
- rows.setdefault(ri, []).append(cell)
-
- entries: List[Dict[str, Any]] = []
- for row_idx in sorted(rows.keys()):
- row_cells = rows[row_idx]
- entry: Dict[str, Any] = {
- 'row_index': row_idx,
- 'english': '',
- 'german': '',
- 'example': '',
- 'text': '', # generic single-column (box sub-sessions)
- 'source_page': '',
- 'marker': '',
- 'confidence': 0.0,
- 'bbox': None,
- 'bbox_en': None,
- 'bbox_de': None,
- 'bbox_ex': None,
- 'bbox_ref': None,
- 'bbox_marker': None,
- 'bbox_text': None,
- 'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '',
- }
-
- confidences = []
- for cell in row_cells:
- col_type = cell['col_type']
- field = col_type_to_field.get(col_type)
- if field:
- entry[field] = cell['text']
- bbox_field = bbox_key_map.get(col_type)
- if bbox_field:
- entry[bbox_field] = cell['bbox_pct']
- if cell['confidence'] > 0:
- confidences.append(cell['confidence'])
-
- # Compute row-level bbox as union of all cell bboxes
- all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')]
- if all_bboxes:
- min_x = min(b['x'] for b in all_bboxes)
- min_y = min(b['y'] for b in all_bboxes)
- max_x2 = max(b['x'] + b['w'] for b in all_bboxes)
- max_y2 = max(b['y'] + b['h'] for b in all_bboxes)
- entry['bbox'] = {
- 'x': round(min_x, 2),
- 'y': round(min_y, 2),
- 'w': round(max_x2 - min_x, 2),
- 'h': round(max_y2 - min_y, 2),
- }
-
- entry['confidence'] = round(
- sum(confidences) / len(confidences), 1
- ) if confidences else 0.0
-
- # Only include if at least one mapped field has text
- has_content = any(
- entry.get(f)
- for f in col_type_to_field.values()
- )
- if has_content:
- entries.append(entry)
-
- return entries
-
-
-# Regex: line starts with phonetic bracket content only (no real word before it)
-_PHONETIC_ONLY_RE = re.compile(
- r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
+# --- v2 build (current default) ---
+from cv_cell_grid_build import ( # noqa: F401
+ _NARROW_COL_THRESHOLD_PCT,
+ _ocr_cell_crop,
+ build_cell_grid_v2,
)
+# --- Legacy build (DEPRECATED) ---
+from cv_cell_grid_legacy import ( # noqa: F401
+ _ocr_single_cell,
+ build_cell_grid,
+)
-def _is_phonetic_only_text(text: str) -> bool:
- """Check if text consists only of phonetic transcription.
+# --- Streaming variants ---
+from cv_cell_grid_streaming import ( # noqa: F401
+ build_cell_grid_streaming,
+ build_cell_grid_v2_streaming,
+)
- Phonetic-only patterns:
- ['mani serva] → True
- [dɑːns] → True
- ["a:mand] → True
- almond ['a:mand] → False (has real word before bracket)
- Mandel → False
- """
- t = text.strip()
- if not t:
- return False
- # Must contain at least one bracket
- if '[' not in t and ']' not in t:
- return False
- # Remove all bracket content and surrounding punctuation/whitespace
- without_brackets = re.sub(r"\[.*?\]", '', t)
- without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
- # If nothing meaningful remains, it's phonetic-only
- alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
- return len(alpha_remaining) < 2
-
-
-def _merge_phonetic_continuation_rows(
- entries: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
- """Merge rows that contain only phonetic transcription into previous entry.
-
- In dictionary pages, phonetic transcription sometimes wraps to the next
- row. E.g.:
- Row 28: EN="it's a money-saver" DE="es spart Kosten"
- Row 29: EN="['mani serva]" DE=""
-
- Row 29 is phonetic-only → merge into row 28's EN field.
- """
- if len(entries) < 2:
- return entries
-
- merged: List[Dict[str, Any]] = []
- for entry in entries:
- en = (entry.get('english') or '').strip()
- de = (entry.get('german') or '').strip()
- ex = (entry.get('example') or '').strip()
-
- # Check if this entry is phonetic-only (EN has only phonetics, DE empty)
- if merged and _is_phonetic_only_text(en) and not de:
- prev = merged[-1]
- prev_en = (prev.get('english') or '').strip()
- # Append phonetic to previous entry's EN
- if prev_en:
- prev['english'] = prev_en + ' ' + en
- else:
- prev['english'] = en
- # If there was an example, append to previous too
- if ex:
- prev_ex = (prev.get('example') or '').strip()
- prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
- logger.debug(
- f"Merged phonetic row {entry.get('row_index')} "
- f"into previous entry: {prev['english']!r}"
- )
- continue
-
- merged.append(entry)
-
- return merged
-
-
-def _merge_wrapped_rows(
- entries: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
- """Merge rows where the primary column (EN) is empty — cell wrap continuation.
-
- In textbook vocabulary tables, columns are often narrow, so the author
- wraps text within a cell. OCR treats each physical line as a separate row.
- The key indicator: if the EN column is empty but DE/example have text,
- this row is a continuation of the previous row's cells.
-
- Example (original textbook has ONE row):
- Row 2: EN="take part (in)" DE="teilnehmen (an), mitmachen" EX="More than 200 singers took"
- Row 3: EN="" DE="(bei)" EX="part in the concert."
- → Merged: EN="take part (in)" DE="teilnehmen (an), mitmachen (bei)" EX="More than 200 singers took part in the concert."
-
- Also handles the reverse case: DE empty but EN has text (wrap in EN column).
- """
- if len(entries) < 2:
- return entries
-
- merged: List[Dict[str, Any]] = []
- for entry in entries:
- en = (entry.get('english') or '').strip()
- de = (entry.get('german') or '').strip()
- ex = (entry.get('example') or '').strip()
-
- if not merged:
- merged.append(entry)
- continue
-
- prev = merged[-1]
- prev_en = (prev.get('english') or '').strip()
- prev_de = (prev.get('german') or '').strip()
- prev_ex = (prev.get('example') or '').strip()
-
- # Case 1: EN is empty → continuation of previous row
- # (DE or EX have text that should be appended to previous row)
- if not en and (de or ex) and prev_en:
- if de:
- if prev_de.endswith(','):
- sep = ' ' # "Wort," + " " + "Ausdruck"
- elif prev_de.endswith(('-', '(')):
- sep = '' # "teil-" + "nehmen" or "(" + "bei)"
- else:
- sep = ' '
- prev['german'] = (prev_de + sep + de).strip()
- if ex:
- sep = ' ' if prev_ex else ''
- prev['example'] = (prev_ex + sep + ex).strip()
- logger.debug(
- f"Merged wrapped row {entry.get('row_index')} into previous "
- f"(empty EN): DE={prev['german']!r}, EX={prev.get('example', '')!r}"
- )
- continue
-
- # Case 2: DE is empty, EN has text that looks like continuation
- # (starts with lowercase or is a parenthetical like "(bei)")
- if en and not de and prev_de:
- is_paren = en.startswith('(')
- first_alpha = next((c for c in en if c.isalpha()), '')
- starts_lower = first_alpha and first_alpha.islower()
-
- if (is_paren or starts_lower) and len(en.split()) < 5:
- sep = ' ' if prev_en and not prev_en.endswith((',', '-', '(')) else ''
- prev['english'] = (prev_en + sep + en).strip()
- if ex:
- sep2 = ' ' if prev_ex else ''
- prev['example'] = (prev_ex + sep2 + ex).strip()
- logger.debug(
- f"Merged wrapped row {entry.get('row_index')} into previous "
- f"(empty DE): EN={prev['english']!r}"
- )
- continue
-
- merged.append(entry)
-
- if len(merged) < len(entries):
- logger.info(
- f"_merge_wrapped_rows: merged {len(entries) - len(merged)} "
- f"continuation rows ({len(entries)} → {len(merged)})"
- )
- return merged
-
-
-def _merge_continuation_rows(
- entries: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
- """Merge multi-line vocabulary entries where text wraps to the next row.
-
- A row is a continuation of the previous entry when:
- - EN has text, but DE is empty
- - EN starts with a lowercase letter (not a new vocab entry)
- - Previous entry's EN does NOT end with a sentence terminator (.!?)
- - The continuation text has fewer than 4 words (not an example sentence)
- - The row was not already merged as phonetic
-
- Example:
- Row 5: EN="to put up" DE="aufstellen"
- Row 6: EN="with sth." DE=""
- → Merged: EN="to put up with sth." DE="aufstellen"
- """
- if len(entries) < 2:
- return entries
-
- merged: List[Dict[str, Any]] = []
- for entry in entries:
- en = (entry.get('english') or '').strip()
- de = (entry.get('german') or '').strip()
-
- if merged and en and not de:
- # Check: not phonetic (already handled)
- if _is_phonetic_only_text(en):
- merged.append(entry)
- continue
-
- # Check: starts with lowercase
- first_alpha = next((c for c in en if c.isalpha()), '')
- starts_lower = first_alpha and first_alpha.islower()
-
- # Check: fewer than 4 words (not an example sentence)
- word_count = len(en.split())
- is_short = word_count < 4
-
- # Check: previous entry doesn't end with sentence terminator
- prev = merged[-1]
- prev_en = (prev.get('english') or '').strip()
- prev_ends_sentence = prev_en and prev_en[-1] in '.!?'
-
- if starts_lower and is_short and not prev_ends_sentence:
- # Merge into previous entry
- prev['english'] = (prev_en + ' ' + en).strip()
- # Merge example if present
- ex = (entry.get('example') or '').strip()
- if ex:
- prev_ex = (prev.get('example') or '').strip()
- prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
- logger.debug(
- f"Merged continuation row {entry.get('row_index')} "
- f"into previous entry: {prev['english']!r}"
- )
- continue
-
- merged.append(entry)
-
- return merged
-
-
-def build_word_grid(
- ocr_img: np.ndarray,
- column_regions: List[PageRegion],
- row_geometries: List[RowGeometry],
- img_w: int,
- img_h: int,
- lang: str = "eng+deu",
- ocr_engine: str = "auto",
- img_bgr: Optional[np.ndarray] = None,
- pronunciation: str = "british",
-) -> List[Dict[str, Any]]:
- """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing.
-
- Wrapper around build_cell_grid() that adds vocabulary-specific logic:
- - Maps cells to english/german/example entries
- - Applies character confusion fixes, IPA lookup, comma splitting, etc.
- - Falls back to returning raw cells if no vocab columns detected.
-
- Args:
- ocr_img: Binarized full-page image (for Tesseract).
- column_regions: Classified columns from Step 3.
- row_geometries: Rows from Step 4.
- img_w, img_h: Image dimensions.
- lang: Default Tesseract language.
- ocr_engine: 'tesseract', 'rapid', or 'auto'.
- img_bgr: BGR color image (required for RapidOCR).
- pronunciation: 'british' or 'american' for IPA lookup.
-
- Returns:
- List of entry dicts with english/german/example text and bbox info (percent).
- """
- cells, columns_meta = build_cell_grid(
- ocr_img, column_regions, row_geometries, img_w, img_h,
- lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
- )
-
- if not cells:
- return []
-
- # Check if vocab layout is present
- col_types = {c['type'] for c in columns_meta}
- if not (col_types & {'column_en', 'column_de'}):
- logger.info("build_word_grid: no vocab columns — returning raw cells")
- return cells
-
- # Vocab mapping: cells → entries
- entries = _cells_to_vocab_entries(cells, columns_meta)
-
- # --- Post-processing pipeline (deterministic, no LLM) ---
- n_raw = len(entries)
-
- # 0. Merge cell-wrap continuation rows (empty primary column = text wrap)
- entries = _merge_wrapped_rows(entries)
-
- # 0a. Merge phonetic-only continuation rows into previous entry
- entries = _merge_phonetic_continuation_rows(entries)
-
- # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
- entries = _merge_continuation_rows(entries)
-
- # 1. Character confusion (| → I, 1 → I, 8 → B) is now run in
- # llm_review_entries_streaming so changes are visible to the user in Step 6.
-
- # 2. Replace OCR'd phonetics with dictionary IPA
- entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
-
- # 3. Split comma-separated word forms (break, broke, broken → 3 entries)
- entries = _split_comma_entries(entries)
-
- # 4. Attach example sentences (rows without DE → examples for preceding entry)
- entries = _attach_example_sentences(entries)
-
- engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown'
- logger.info(f"build_word_grid: {len(entries)} entries from "
- f"{n_raw} raw → {len(entries)} after post-processing "
- f"(engine={engine_name})")
-
- return entries
+# --- Row merging ---
+from cv_cell_grid_merge import ( # noqa: F401
+ _PHONETIC_ONLY_RE,
+ _is_phonetic_only_text,
+ _merge_continuation_rows,
+ _merge_phonetic_continuation_rows,
+ _merge_wrapped_rows,
+)
+# --- Vocab extraction ---
+from cv_cell_grid_vocab import ( # noqa: F401
+ _cells_to_vocab_entries,
+ build_word_grid,
+)
diff --git a/klausur-service/backend/cv_cell_grid_build.py b/klausur-service/backend/cv_cell_grid_build.py
new file mode 100644
index 0000000..9ac0ac5
--- /dev/null
+++ b/klausur-service/backend/cv_cell_grid_build.py
@@ -0,0 +1,498 @@
+"""
+Cell-grid construction v2 (hybrid: broad columns via word lookup, narrow via cell-crop).
+Extracted from cv_cell_grid.py.
+Lizenz: Apache 2.0 — DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
    RAPIDOCR_AVAILABLE,
    _assign_row_words_to_columns,
    _clean_cell_text,
    _clean_cell_text_lite,
    _words_to_reading_order_text,
    _words_to_spaced_text,
    ocr_region,
    ocr_region_lighton,
    ocr_region_rapid,
    ocr_region_trocr,
)
+from cv_cell_grid_helpers import (
+ _MIN_WORD_CONF,
+ _ensure_minimum_crop_size,
+ _heal_row_gaps,
+ _is_artifact_row,
+ _select_psm_for_column,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+ import cv2
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+
+
+# ---------------------------------------------------------------------------
+# _ocr_cell_crop — isolated cell-crop OCR for v2 hybrid mode
+# ---------------------------------------------------------------------------
+
def _ocr_cell_crop(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
) -> Dict[str, Any]:
    """OCR a single cell by cropping the exact column x row intersection.

    No padding beyond cell boundaries -> no neighbour bleeding.

    Args:
        row_idx, col_idx: Grid coordinates; used for the cell_id and logging.
        row, col: Row band and column band whose intersection forms the cell.
        ocr_img: Preprocessed grayscale page; used for the pixel-density
            emptiness check and the Tesseract branch.
        img_bgr: Original colour page required by the TrOCR / LightOn /
            RapidOCR branches; when None those branches produce no words.
        img_w, img_h: Full page dimensions in pixels (for bbox_pct).
        engine_name: One of 'tesseract', 'rapid', 'lighton',
            'trocr-printed', 'trocr-handwritten'.
        lang: Default Tesseract language string.
        lang_map: Per-column-type Tesseract language overrides.

    Returns:
        Cell dict (cell_id, row/col indices, col_type, text, confidence,
        bbox_px, bbox_pct, ocr_engine, is_bold); additionally 'word_boxes'
        in absolute image coordinates when any words survived filtering.
    """
    # Display bbox: exact column x row intersection
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # Crop boundaries: add small internal padding (3px each side) to avoid
    # clipping characters near column/row edges (e.g. parentheses, descenders).
    # Stays within image bounds but may extend slightly beyond strict cell.
    # 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
    _PAD = 3
    cx = max(0, disp_x - _PAD)
    cy = max(0, disp_y - _PAD)
    cx2 = min(img_w, disp_x + disp_w + _PAD)
    cy2 = min(img_h, disp_y + disp_h + _PAD)
    cw = cx2 - cx
    ch = cy2 - cy

    # Base cell record: early exits return it as-is; the success path at the
    # bottom copies it and fills in text/confidence/engine.
    empty_cell = {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': '',
        'confidence': 0.0,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2) if img_w else 0,
            'y': round(disp_y / img_h * 100, 2) if img_h else 0,
            'w': round(disp_w / img_w * 100, 2) if img_w else 0,
            'h': round(disp_h / img_h * 100, 2) if img_h else 0,
        },
        'ocr_engine': 'cell_crop_v2',
        'is_bold': False,
    }

    if cw <= 0 or ch <= 0:
        logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
        return empty_cell

    # --- Pixel-density check: skip truly empty cells ---
    # NOTE(review): thresholds (gray < 180, dark_ratio < 0.005) assume a
    # light-background scan — confirm for inverted/dark sources.
    if ocr_img is not None:
        crop = ocr_img[cy:cy + ch, cx:cx + cw]
        if crop.size > 0:
            dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
            if dark_ratio < 0.005:
                logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
                             row_idx, col_idx, dark_ratio, cw, ch)
                return empty_cell

    # --- Prepare crop for OCR ---
    cell_lang = lang_map.get(col.type, lang)
    psm = _select_psm_for_column(col.type, col.width, row.height)
    text = ''
    avg_conf = 0.0
    used_engine = 'cell_crop_v2'

    # --- Engine dispatch: every branch yields word dicts whose left/top are
    # remapped to absolute page coordinates before further processing. ---
    if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_trocr(img_bgr, cell_region,
                                 handwritten=(engine_name == "trocr-handwritten"))
    elif engine_name == "lighton" and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_lighton(img_bgr, cell_region)
    elif engine_name == "rapid" and img_bgr is not None:
        # Upscale small BGR crops for RapidOCR.
        bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
        if bgr_crop.size == 0:
            words = []
        else:
            crop_h, crop_w = bgr_crop.shape[:2]
            if crop_h < 80:
                # Force 3x upscale for short rows — small chars need more pixels
                scale = 3.0
                bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
                                    interpolation=cv2.INTER_CUBIC)
            else:
                bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
            up_h, up_w = bgr_up.shape[:2]
            scale_x = up_w / max(crop_w, 1)
            scale_y = up_h / max(crop_h, 1)
            was_scaled = (up_w != crop_w or up_h != crop_h)
            logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
                         row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region_rapid(bgr_up, tmp_region)
            # Remap positions back to original image coords
            if words and was_scaled:
                for w in words:
                    w['left'] = int(w['left'] / scale_x) + cx
                    w['top'] = int(w['top'] / scale_y) + cy
                    w['width'] = int(w['width'] / scale_x)
                    w['height'] = int(w['height'] / scale_y)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
    else:
        # Tesseract: upscale tiny crops for better recognition
        if ocr_img is not None:
            crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
            upscaled = _ensure_minimum_crop_size(crop_slice)
            up_h, up_w = upscaled.shape[:2]
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm)
            # Remap word positions back to original image coordinates
            if words and (up_w != cw or up_h != ch):
                sx = cw / max(up_w, 1)
                sy = ch / max(up_h, 1)
                for w in words:
                    w['left'] = int(w['left'] * sx) + cx
                    w['top'] = int(w['top'] * sy) + cy
                    w['width'] = int(w['width'] * sx)
                    w['height'] = int(w['height'] * sy)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
        else:
            words = []

    # Filter low-confidence words
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # y-tolerance at least the crop height so the whole cell reads as
        # one logical line group.
        y_tol = max(15, ch)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
                     row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
    else:
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
                     row_idx, col_idx, cw, ch, psm, engine_name)

    # --- PSM 7 fallback for still-empty Tesseract cells ---
    if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
        crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
        upscaled = _ensure_minimum_crop_size(crop_slice)
        up_h, up_w = upscaled.shape[:2]
        tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
        psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_crop_v2_psm7'
                # Remap PSM7 word positions back to original image coords
                if up_w != cw or up_h != ch:
                    sx = cw / max(up_w, 1)
                    sy = ch / max(up_h, 1)
                    for w in psm7_words:
                        w['left'] = int(w['left'] * sx) + cx
                        w['top'] = int(w['top'] * sy) + cy
                        w['width'] = int(w['width'] * sx)
                        w['height'] = int(w['height'] * sy)
                else:
                    for w in psm7_words:
                        w['left'] += cx
                        w['top'] += cy
                words = psm7_words

    # --- Noise filter ---
    if text.strip():
        pre_filter = text
        text = _clean_cell_text_lite(text)
        if not text:
            logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
                         row_idx, col_idx, pre_filter)
            avg_conf = 0.0

    result = dict(empty_cell)
    result['text'] = text
    result['confidence'] = avg_conf
    result['ocr_engine'] = used_engine

    # Store individual word bounding boxes (absolute image coordinates)
    # for pixel-accurate overlay positioning in the frontend.
    if words and text.strip():
        result['word_boxes'] = [
            {
                'text': w.get('text', ''),
                'left': w['left'],
                'top': w['top'],
                'width': w['width'],
                'height': w['height'],
                'conf': w.get('conf', 0),
            }
            for w in words
            if w.get('text', '').strip()
        ]

    return result
+
+
+# Threshold: columns narrower than this (% of image width) use single-cell
+# crop OCR instead of full-page word assignment.
+_NARROW_COL_THRESHOLD_PCT = 15.0
+
+
+# ---------------------------------------------------------------------------
+# build_cell_grid_v2 — hybrid grid builder (current default)
+# ---------------------------------------------------------------------------
+
def build_cell_grid_v2(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
    skip_heal_gaps: bool = False,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.

    Drop-in replacement for build_cell_grid() -- same signature & return type.

    Strategy:
    - Broad columns (>15% image width): Use pre-assigned full-page Tesseract
      words (from row.words). Handles IPA brackets, punctuation, sentence
      continuity correctly.
    - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
      neighbour bleeding from adjacent broad columns.

    Args:
        ocr_img: Preprocessed grayscale page image.
        column_regions: Detected column bands (non-content types filtered out).
        row_geometries: Detected row bands incl. header/footer/content types.
        img_w, img_h: Page dimensions in pixels.
        lang: Default Tesseract language string.
        ocr_engine: 'auto', 'tesseract', 'rapid', 'lighton', 'trocr-*'.
        img_bgr: Original colour page, needed by non-Tesseract engines.
        skip_heal_gaps: When True, keep rows at their exact detected
            positions instead of stretching them over removed-row gaps.

    Returns:
        (cells, columns_meta): cells sorted by (row_index, col_index) with
        all-empty rows removed; columns_meta lists index/type/x/width per
        retained column.
    """
    engine_name = "tesseract"
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
        engine_name = "rapid"

    logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")

    # Filter to content rows only
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows found")
        return [], []

    # Filter phantom rows (word_count=0) and artifact rows
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows with words found")
        return [], []

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
        return [], []

    # Filter columns
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid_v2: no usable columns found")
        return [], []

    # Heal row gaps -- use header/footer boundaries
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    # skip_heal_gaps: When True, keep cell positions at their exact row geometry
    # positions without expanding to fill gaps from removed rows.
    if not skip_heal_gaps:
        _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # --- Classify columns as broad vs narrow ---
    narrow_col_indices = set()
    for ci, col in enumerate(relevant_cols):
        col_pct = (col.width / img_w * 100) if img_w > 0 else 0
        if col_pct < _NARROW_COL_THRESHOLD_PCT:
            narrow_col_indices.add(ci)

    broad_col_count = len(relevant_cols) - len(narrow_col_indices)
    logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
                f"{len(narrow_col_indices)} narrow columns (cell-crop)")

    # --- Phase 1: Broad columns via full-page word assignment ---
    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Assign full-page words to columns for this row
        col_words = _assign_row_words_to_columns(row, relevant_cols)

        for col_idx, col in enumerate(relevant_cols):
            if col_idx not in narrow_col_indices:
                # BROAD column: use pre-assigned full-page words
                words = col_words.get(col_idx, [])
                # Filter low-confidence words
                words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

                # Single full-width column (box sub-session): preserve spacing
                is_single_full_column = (
                    len(relevant_cols) == 1
                    and img_w > 0
                    and relevant_cols[0].width / img_w > 0.9
                )

                if words:
                    y_tol = max(15, row.height)
                    if is_single_full_column:
                        text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
                        logger.info(f"R{row_idx:02d}: {len(words)} words, "
                                    f"text={text!r:.100}")
                    else:
                        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                    avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
                else:
                    text = ''
                    avg_conf = 0.0
                    if is_single_full_column:
                        logger.info(f"R{row_idx:02d}: 0 words (row has "
                                    f"{row.word_count} total, y={row.y}..{row.y+row.height})")

                # Apply noise filter -- but NOT for single-column sub-sessions
                if not is_single_full_column:
                    text = _clean_cell_text(text)

                cell = {
                    'cell_id': f"R{row_idx:02d}_C{col_idx}",
                    'row_index': row_idx,
                    'col_index': col_idx,
                    'col_type': col.type,
                    'text': text,
                    'confidence': avg_conf,
                    'bbox_px': {
                        'x': col.x, 'y': row.y,
                        'w': col.width, 'h': row.height,
                    },
                    'bbox_pct': {
                        'x': round(col.x / img_w * 100, 2) if img_w else 0,
                        'y': round(row.y / img_h * 100, 2) if img_h else 0,
                        'w': round(col.width / img_w * 100, 2) if img_w else 0,
                        'h': round(row.height / img_h * 100, 2) if img_h else 0,
                    },
                    'ocr_engine': 'word_lookup',
                    'is_bold': False,
                }
                # Store word bounding boxes for pixel-accurate overlay
                if words and text.strip():
                    cell['word_boxes'] = [
                        {
                            'text': w.get('text', ''),
                            'left': w['left'],
                            'top': w['top'],
                            'width': w['width'],
                            'height': w['height'],
                            'conf': w.get('conf', 0),
                        }
                        for w in words
                        if w.get('text', '').strip()
                    ]
                cells.append(cell)

    # --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
    narrow_tasks = []
    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            if col_idx in narrow_col_indices:
                narrow_tasks.append((row_idx, col_idx, row, col))

    if narrow_tasks:
        # Tesseract is subprocess-based, so more workers are safe; other
        # engines hold model state and get a smaller pool.
        max_workers = 4 if engine_name == "tesseract" else 2
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = {
                pool.submit(
                    _ocr_cell_crop,
                    ri, ci, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    engine_name, lang, lang_map,
                ): (ri, ci)
                for ri, ci, row, col in narrow_tasks
            }
            for future in as_completed(futures):
                try:
                    cell = future.result()
                    cells.append(cell)
                except Exception as e:
                    ri, ci = futures[future]
                    logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")

    # Sort cells by (row_index, col_index)
    cells.sort(key=lambda c: (c['row_index'], c['col_index']))

    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")

    logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
                f"{len(content_rows)} rows x {len(relevant_cols)} columns, "
                f"engine={engine_name} (hybrid)")

    return cells, columns_meta
diff --git a/klausur-service/backend/cv_cell_grid_helpers.py b/klausur-service/backend/cv_cell_grid_helpers.py
new file mode 100644
index 0000000..f5e41d3
--- /dev/null
+++ b/klausur-service/backend/cv_cell_grid_helpers.py
@@ -0,0 +1,136 @@
+"""
+Shared helpers for cell-grid construction (v2 + legacy).
+
+Extracted from cv_cell_grid.py — used by both cv_cell_grid_build and
+cv_cell_grid_legacy.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import List
+
+import numpy as np
+
+from cv_vocab_types import RowGeometry
+
+logger = logging.getLogger(__name__)
+
+try:
+ import cv2
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+
+# Minimum OCR word confidence to keep (used across multiple functions)
+_MIN_WORD_CONF = 30
+
+
+def _compute_cell_padding(col_width: int, img_w: int) -> int:
+ """Adaptive padding for OCR crops based on column width.
+
+ Narrow columns (page_ref, marker) need more surrounding context so
+ Tesseract can segment characters correctly. Wide columns keep the
+ minimal 4 px padding to avoid pulling in neighbours.
+ """
+ col_pct = col_width / img_w * 100 if img_w > 0 else 100
+ if col_pct < 5:
+ return max(20, col_width // 2)
+ if col_pct < 10:
+ return max(12, col_width // 4)
+ if col_pct < 15:
+ return 8
+ return 4
+
+
+def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
+ max_scale: int = 3) -> np.ndarray:
+ """Upscale tiny crops so Tesseract gets enough pixel data.
+
+ If either dimension is below *min_dim*, the crop is bicubic-upscaled
+ so the smallest dimension reaches *min_dim* (capped at *max_scale* x).
+ """
+ h, w = crop.shape[:2]
+ if h >= min_dim and w >= min_dim:
+ return crop
+ scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
+ if scale <= 1.0:
+ return crop
+ new_w = int(w * scale)
+ new_h = int(h * scale)
+ return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
+
+
+def _select_psm_for_column(col_type: str, col_width: int,
+ row_height: int) -> int:
+ """Choose the best Tesseract PSM for a given column geometry.
+
+ - page_ref columns are almost always single short tokens -> PSM 8
+ - Very narrow or short cells -> PSM 7 (single text line)
+ - Everything else -> PSM 6 (uniform block)
+ """
+ if col_type in ('page_ref', 'marker'):
+ return 8 # single word
+ if col_width < 100 or row_height < 30:
+ return 7 # single line
+ return 6 # uniform block
+
+
+def _is_artifact_row(row: RowGeometry) -> bool:
+ """Return True if this row contains only scan artifacts, not real text.
+
+ Artifact rows (scanner shadows, noise) typically produce only single-character
+ detections. A real content row always has at least one token with 2+ characters.
+ """
+ if row.word_count == 0:
+ return True
+ texts = [w.get('text', '').strip() for w in row.words]
+ return all(len(t) <= 1 for t in texts)
+
+
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Stretch content rows in place so they tile the vertical span.

    Filtering out empty or artifact rows leaves vertical holes between
    the survivors. Each remaining row is mutated so its top and bottom
    move to the midpoint of the gap to its neighbour; the first row snaps
    to *top_bound* and the last to *bottom_bound*, so OCR crops cover the
    full available content area.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    # Snapshot (top, bottom) spans before mutating anything, so neighbour
    # lookups are not skewed by rows already adjusted.
    spans = [(r.y, r.y + r.height) for r in rows]
    last = len(rows) - 1

    for i, row in enumerate(rows):
        top, bottom = spans[i]

        if i == 0:
            top = top_bound
        else:
            prev_bottom = spans[i - 1][1]
            gap_above = top - prev_bottom
            if gap_above > 1:
                top = prev_bottom + gap_above // 2

        if i == last:
            bottom = bottom_bound
        else:
            gap_below = spans[i + 1][0] - bottom
            if gap_below > 1:
                bottom = bottom + gap_below // 2

        row.y = top
        row.height = max(5, bottom - top)

    logger.debug(
        f"_heal_row_gaps: {len(rows)} rows -> y range "
        f"[{rows[0].y}..{rows[-1].y + rows[-1].height}] "
        f"(bounds: top={top_bound}, bottom={bottom_bound})"
    )
diff --git a/klausur-service/backend/cv_cell_grid_legacy.py b/klausur-service/backend/cv_cell_grid_legacy.py
new file mode 100644
index 0000000..e00df7c
--- /dev/null
+++ b/klausur-service/backend/cv_cell_grid_legacy.py
@@ -0,0 +1,436 @@
+"""
+Legacy cell-grid construction (v1) -- DEPRECATED, kept for backward compat.
+
+Extracted from cv_cell_grid.py. Prefer build_cell_grid_v2 for new code.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
    RAPIDOCR_AVAILABLE,
    _assign_row_words_to_columns,
    _clean_cell_text,
    _words_to_reading_order_text,
    ocr_region,
    ocr_region_lighton,
    ocr_region_rapid,
    ocr_region_trocr,
)
+from cv_cell_grid_helpers import (
+ _MIN_WORD_CONF,
+ _compute_cell_padding,
+ _ensure_minimum_crop_size,
+ _heal_row_gaps,
+ _is_artifact_row,
+ _select_psm_for_column,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# _ocr_single_cell — legacy per-cell OCR with multi-level fallback
+# ---------------------------------------------------------------------------
+
def _ocr_single_cell(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    use_rapid: bool,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
    preassigned_words: Optional[List[Dict]] = None,
) -> Dict[str, Any]:
    """Populate a single cell (column x row intersection) via word lookup.

    Resolution order -- first strategy yielding non-empty text wins:
      1. PRIMARY:   *preassigned_words* from the full-page Tesseract pass.
      2. FALLBACK:  per-cell OCR on a padded crop (only attempted when the
                    crop has enough dark pixels to plausibly contain ink).
      3. SECONDARY: Tesseract PSM 7 (single line) retry, non-Rapid only.
      4. TERTIARY:  RapidOCR over the whole row strip, filtered to words
                    overlapping this column (narrow columns only).
    Finally _clean_cell_text() may clear results that are pure OCR noise.

    Returns:
        Cell dict with text, confidence, bbox_px/bbox_pct and the
        'ocr_engine' label of the strategy that produced the text.
    """
    # Display bbox: exact column x row intersection (no padding)
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # OCR crop: adaptive padding -- narrow columns get more context
    pad = _compute_cell_padding(col.width, img_w)
    cell_x = max(0, col.x - pad)
    cell_y = max(0, row.y - pad)
    cell_w = min(col.width + 2 * pad, img_w - cell_x)
    cell_h = min(row.height + 2 * pad, img_h - cell_y)
    is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False

    if disp_w <= 0 or disp_h <= 0:
        # Degenerate geometry: emit an empty cell so the grid stays rectangular.
        return {
            'cell_id': f"R{row_idx:02d}_C{col_idx}",
            'row_index': row_idx,
            'col_index': col_idx,
            'col_type': col.type,
            'text': '',
            'confidence': 0.0,
            'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
            'bbox_pct': {
                'x': round(col.x / img_w * 100, 2),
                'y': round(row.y / img_h * 100, 2),
                'w': round(col.width / img_w * 100, 2),
                'h': round(row.height / img_h * 100, 2),
            },
            'ocr_engine': 'word_lookup',
        }

    # --- PRIMARY: Word-lookup from full-page Tesseract ---
    words = preassigned_words if preassigned_words is not None else []
    used_engine = 'word_lookup'

    # Filter low-confidence words
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        y_tol = max(15, row.height)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
    else:
        text = ''
        avg_conf = 0.0

    # --- FALLBACK: Cell-OCR for empty cells ---
    # Only worth running when the crop actually contains dark pixels.
    # NOTE(review): thresholds (gray < 180, ratio > 0.005) assume a
    # light-background scan.
    _run_fallback = False
    if not text.strip() and cell_w > 0 and cell_h > 0:
        if ocr_img is not None:
            crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            if crop.size > 0:
                dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                _run_fallback = dark_ratio > 0.005
    if _run_fallback:
        # For narrow columns, upscale the crop before OCR
        if is_narrow and ocr_img is not None:
            _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            _upscaled = _ensure_minimum_crop_size(_crop_slice)
            if _upscaled is not _crop_slice:
                _up_h, _up_w = _upscaled.shape[:2]
                _tmp_region = PageRegion(
                    type=col.type, x=0, y=0, width=_up_w, height=_up_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(_upscaled, _tmp_region,
                                            lang=cell_lang, psm=_cell_psm)
                # Remap word positions back to original image coordinates
                _sx = cell_w / max(_up_w, 1)
                _sy = cell_h / max(_up_h, 1)
                for _fw in (fallback_words or []):
                    _fw['left'] = int(_fw['left'] * _sx) + cell_x
                    _fw['top'] = int(_fw['top'] * _sy) + cell_y
                    _fw['width'] = int(_fw['width'] * _sx)
                    _fw['height'] = int(_fw['height'] * _sy)
            else:
                # Crop was already large enough -- OCR it in place.
                cell_region = PageRegion(
                    type=col.type, x=cell_x, y=cell_y,
                    width=cell_w, height=cell_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)
        else:
            # Broad column: dispatch to whichever engine is configured.
            cell_region = PageRegion(
                type=col.type,
                x=cell_x, y=cell_y,
                width=cell_w, height=cell_h,
            )
            if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
                fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
            elif engine_name == "lighton" and img_bgr is not None:
                fallback_words = ocr_region_lighton(img_bgr, cell_region)
            elif use_rapid and img_bgr is not None:
                fallback_words = ocr_region_rapid(img_bgr, cell_region)
            else:
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)

        if fallback_words:
            fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if fallback_words:
            fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
            fb_y_tol = max(10, int(fb_avg_h * 0.5))
            fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
            if fb_text.strip():
                text = fb_text
                avg_conf = round(
                    sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
                )
                used_engine = 'cell_ocr_fallback'

    # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
    if not text.strip() and _run_fallback and not use_rapid:
        _fb_region = PageRegion(
            type=col.type, x=cell_x, y=cell_y,
            width=cell_w, height=cell_h,
        )
        cell_lang = lang_map.get(col.type, lang)
        psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_ocr_psm7'

    # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
    if not text.strip() and is_narrow and img_bgr is not None:
        row_region = PageRegion(
            type='_row_strip', x=0, y=row.y,
            width=img_w, height=row.height,
        )
        strip_words = ocr_region_rapid(img_bgr, row_region)
        if strip_words:
            col_left = col.x
            col_right = col.x + col.width
            col_words = []
            for sw in strip_words:
                sw_left = sw.get('left', 0)
                sw_right = sw_left + sw.get('width', 0)
                # Keep words whose horizontal overlap with this column
                # exceeds 30% of the word's own width.
                overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
                if overlap > sw.get('width', 1) * 0.3:
                    col_words.append(sw)
            if col_words:
                col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
            if col_words:
                rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
                if rs_text.strip():
                    text = rs_text
                    avg_conf = round(
                        sum(w['conf'] for w in col_words) / len(col_words), 1
                    )
                    used_engine = 'row_strip_rapid'

    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
    if text.strip():
        text = _clean_cell_text(text)
        if not text:
            avg_conf = 0.0

    return {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': text,
        'confidence': avg_conf,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2),
            'y': round(disp_y / img_h * 100, 2),
            'w': round(disp_w / img_w * 100, 2),
            'h': round(disp_h / img_h * 100, 2),
        },
        'ocr_engine': used_engine,
    }
+
+
+# ---------------------------------------------------------------------------
+# build_cell_grid — legacy grid builder (DEPRECATED)
+# ---------------------------------------------------------------------------
+
def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Generic Cell-Grid: Columns x Rows -> cells with OCR text.

    DEPRECATED: Use build_cell_grid_v2 instead.

    Pipeline: filter content rows (phantom/artifact removal) and usable
    columns, heal vertical row gaps, OCR each cell via _ocr_single_cell(),
    then batch-re-OCR whole column strips for columns with 3+ still-empty
    cells, and finally drop rows where every cell stayed empty.

    Returns:
        (cells, columns_meta) -- same shape as build_cell_grid_v2.
    """
    # Resolve engine choice
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")

    # Filter to content rows only (skip header/footer)
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid: no content rows found")
        return [], []

    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows with words found")
        return [], []

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid: no usable columns found")
        return [], []

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows after artifact filtering")
        return [], []

    # NOTE(review): bounds are derived from column-region extents here,
    # whereas build_cell_grid_v2 uses header/footer row boundaries --
    # presumably the pre-refactor behaviour; confirm intentional.
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            cells.append(cell)

    # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
    # Collect cells that stayed empty despite visible ink (dark pixels),
    # grouped by column index.
    empty_by_col: Dict[int, List[int]] = {}
    for ci, cell in enumerate(cells):
        if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
            bpx = cell['bbox_px']
            x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
            if w > 0 and h > 0 and ocr_img is not None:
                crop = ocr_img[y:y + h, x:x + w]
                if crop.size > 0:
                    dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                    if dark_ratio > 0.005:
                        empty_by_col.setdefault(cell['col_index'], []).append(ci)

    for col_idx, cell_indices in empty_by_col.items():
        # Strip OCR only pays off for 3+ empty cells in the same column.
        if len(cell_indices) < 3:
            continue

        min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
        max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
        col_x = cells[cell_indices[0]]['bbox_px']['x']
        col_w = cells[cell_indices[0]]['bbox_px']['w']

        strip_region = PageRegion(
            type=relevant_cols[col_idx].type,
            x=col_x, y=min_y,
            width=col_w, height=max_y_h - min_y,
        )
        strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)

        if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
            strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
        elif engine_name == "lighton" and img_bgr is not None:
            strip_words = ocr_region_lighton(img_bgr, strip_region)
        elif use_rapid and img_bgr is not None:
            strip_words = ocr_region_rapid(img_bgr, strip_region)
        else:
            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)

        if not strip_words:
            continue

        # Hard-coded 30 mirrors _MIN_WORD_CONF.
        strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
        if not strip_words:
            continue

        for ci in cell_indices:
            cell_y = cells[ci]['bbox_px']['y']
            cell_h = cells[ci]['bbox_px']['h']
            cell_mid_y = cell_y + cell_h / 2

            # Match strip words to this cell by vertical centre distance.
            matched_words = [
                w for w in strip_words
                if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
            ]
            if matched_words:
                matched_words.sort(key=lambda w: w['left'])
                batch_text = ' '.join(w['text'] for w in matched_words)
                batch_text = _clean_cell_text(batch_text)
                if batch_text.strip():
                    cells[ci]['text'] = batch_text
                    cells[ci]['confidence'] = round(
                        sum(w['conf'] for w in matched_words) / len(matched_words), 1
                    )
                    cells[ci]['ocr_engine'] = 'batch_column_ocr'

    batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
    if batch_filled > 0:
        logger.info(
            f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
            f"empty cells in column {col_idx}"
        )

    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")

    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows x {len(relevant_cols)} columns, "
                f"engine={engine_name}")

    return cells, columns_meta
diff --git a/klausur-service/backend/cv_cell_grid_merge.py b/klausur-service/backend/cv_cell_grid_merge.py
new file mode 100644
index 0000000..a86770e
--- /dev/null
+++ b/klausur-service/backend/cv_cell_grid_merge.py
@@ -0,0 +1,235 @@
+"""
+Row-merging logic for vocabulary entries (phonetic, wrapped, continuation rows).
+
+Extracted from cv_cell_grid.py.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import re
+from typing import Any, Dict, List
+
+from cv_ocr_engines import _RE_ALPHA
+
+logger = logging.getLogger(__name__)
+
# Regex: line starts with phonetic bracket content only (no real word before it)
# NOTE(review): this pattern appears unused in this module --
# _is_phonetic_only_text builds its own inline regexes instead.
# Candidate for removal; confirm no external module imports it first.
_PHONETIC_ONLY_RE = re.compile(
    r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
)
+
+
+def _is_phonetic_only_text(text: str) -> bool:
+ """Check if text consists only of phonetic transcription.
+
+ Phonetic-only patterns:
+ ['mani serva] -> True
+ [dance] -> True
+ ["a:mand] -> True
+ almond ['a:mand] -> False (has real word before bracket)
+ Mandel -> False
+ """
+ t = text.strip()
+ if not t:
+ return False
+ # Must contain at least one bracket
+ if '[' not in t and ']' not in t:
+ return False
+ # Remove all bracket content and surrounding punctuation/whitespace
+ without_brackets = re.sub(r"\[.*?\]", '', t)
+ without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
+ # If nothing meaningful remains, it's phonetic-only
+ alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
+ return len(alpha_remaining) < 2
+
+
+def _merge_phonetic_continuation_rows(
+ entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+ """Merge rows that contain only phonetic transcription into previous entry.
+
+ In dictionary pages, phonetic transcription sometimes wraps to the next
+ row. E.g.:
+ Row 28: EN="it's a money-saver" DE="es spart Kosten"
+ Row 29: EN="['mani serva]" DE=""
+
+ Row 29 is phonetic-only -> merge into row 28's EN field.
+ """
+ if len(entries) < 2:
+ return entries
+
+ merged: List[Dict[str, Any]] = []
+ for entry in entries:
+ en = (entry.get('english') or '').strip()
+ de = (entry.get('german') or '').strip()
+ ex = (entry.get('example') or '').strip()
+
+ # Check if this entry is phonetic-only (EN has only phonetics, DE empty)
+ if merged and _is_phonetic_only_text(en) and not de:
+ prev = merged[-1]
+ prev_en = (prev.get('english') or '').strip()
+ # Append phonetic to previous entry's EN
+ if prev_en:
+ prev['english'] = prev_en + ' ' + en
+ else:
+ prev['english'] = en
+ # If there was an example, append to previous too
+ if ex:
+ prev_ex = (prev.get('example') or '').strip()
+ prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
+ logger.debug(
+ f"Merged phonetic row {entry.get('row_index')} "
+ f"into previous entry: {prev['english']!r}"
+ )
+ continue
+
+ merged.append(entry)
+
+ return merged
+
+
+def _merge_wrapped_rows(
+ entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+ """Merge rows where the primary column (EN) is empty -- cell wrap continuation.
+
+ In textbook vocabulary tables, columns are often narrow, so the author
+ wraps text within a cell. OCR treats each physical line as a separate row.
+ The key indicator: if the EN column is empty but DE/example have text,
+ this row is a continuation of the previous row's cells.
+
+ Example (original textbook has ONE row):
+ Row 2: EN="take part (in)" DE="teilnehmen (an), mitmachen" EX="More than 200 singers took"
+ Row 3: EN="" DE="(bei)" EX="part in the concert."
+ -> Merged: EN="take part (in)" DE="teilnehmen (an), mitmachen (bei)" EX="..."
+
+ Also handles the reverse case: DE empty but EN has text (wrap in EN column).
+ """
+ if len(entries) < 2:
+ return entries
+
+ merged: List[Dict[str, Any]] = []
+ for entry in entries:
+ en = (entry.get('english') or '').strip()
+ de = (entry.get('german') or '').strip()
+ ex = (entry.get('example') or '').strip()
+
+ if not merged:
+ merged.append(entry)
+ continue
+
+ prev = merged[-1]
+ prev_en = (prev.get('english') or '').strip()
+ prev_de = (prev.get('german') or '').strip()
+ prev_ex = (prev.get('example') or '').strip()
+
+ # Case 1: EN is empty -> continuation of previous row
+ if not en and (de or ex) and prev_en:
+ if de:
+ if prev_de.endswith(','):
+ sep = ' '
+ elif prev_de.endswith(('-', '(')):
+ sep = ''
+ else:
+ sep = ' '
+ prev['german'] = (prev_de + sep + de).strip()
+ if ex:
+ sep = ' ' if prev_ex else ''
+ prev['example'] = (prev_ex + sep + ex).strip()
+ logger.debug(
+ f"Merged wrapped row {entry.get('row_index')} into previous "
+ f"(empty EN): DE={prev['german']!r}, EX={prev.get('example', '')!r}"
+ )
+ continue
+
+ # Case 2: DE is empty, EN has text that looks like continuation
+ if en and not de and prev_de:
+ is_paren = en.startswith('(')
+ first_alpha = next((c for c in en if c.isalpha()), '')
+ starts_lower = first_alpha and first_alpha.islower()
+
+ if (is_paren or starts_lower) and len(en.split()) < 5:
+ sep = ' ' if prev_en and not prev_en.endswith((',', '-', '(')) else ''
+ prev['english'] = (prev_en + sep + en).strip()
+ if ex:
+ sep2 = ' ' if prev_ex else ''
+ prev['example'] = (prev_ex + sep2 + ex).strip()
+ logger.debug(
+ f"Merged wrapped row {entry.get('row_index')} into previous "
+ f"(empty DE): EN={prev['english']!r}"
+ )
+ continue
+
+ merged.append(entry)
+
+ if len(merged) < len(entries):
+ logger.info(
+ f"_merge_wrapped_rows: merged {len(entries) - len(merged)} "
+ f"continuation rows ({len(entries)} -> {len(merged)})"
+ )
+ return merged
+
+
+def _merge_continuation_rows(
+ entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+ """Merge multi-line vocabulary entries where text wraps to the next row.
+
+ A row is a continuation of the previous entry when:
+ - EN has text, but DE is empty
+ - EN starts with a lowercase letter (not a new vocab entry)
+ - Previous entry's EN does NOT end with a sentence terminator (.!?)
+ - The continuation text has fewer than 4 words (not an example sentence)
+ - The row was not already merged as phonetic
+
+ Example:
+ Row 5: EN="to put up" DE="aufstellen"
+ Row 6: EN="with sth." DE=""
+ -> Merged: EN="to put up with sth." DE="aufstellen"
+ """
+ if len(entries) < 2:
+ return entries
+
+ merged: List[Dict[str, Any]] = []
+ for entry in entries:
+ en = (entry.get('english') or '').strip()
+ de = (entry.get('german') or '').strip()
+
+ if merged and en and not de:
+ # Check: not phonetic (already handled)
+ if _is_phonetic_only_text(en):
+ merged.append(entry)
+ continue
+
+ # Check: starts with lowercase
+ first_alpha = next((c for c in en if c.isalpha()), '')
+ starts_lower = first_alpha and first_alpha.islower()
+
+ # Check: fewer than 4 words (not an example sentence)
+ word_count = len(en.split())
+ is_short = word_count < 4
+
+ # Check: previous entry doesn't end with sentence terminator
+ prev = merged[-1]
+ prev_en = (prev.get('english') or '').strip()
+ prev_ends_sentence = prev_en and prev_en[-1] in '.!?'
+
+ if starts_lower and is_short and not prev_ends_sentence:
+ # Merge into previous entry
+ prev['english'] = (prev_en + ' ' + en).strip()
+ # Merge example if present
+ ex = (entry.get('example') or '').strip()
+ if ex:
+ prev_ex = (prev.get('example') or '').strip()
+ prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
+ logger.debug(
+ f"Merged continuation row {entry.get('row_index')} "
+ f"into previous entry: {prev['english']!r}"
+ )
+ continue
+
+ merged.append(entry)
+
+ return merged
diff --git a/klausur-service/backend/cv_cell_grid_streaming.py b/klausur-service/backend/cv_cell_grid_streaming.py
new file mode 100644
index 0000000..4db3268
--- /dev/null
+++ b/klausur-service/backend/cv_cell_grid_streaming.py
@@ -0,0 +1,217 @@
+"""
+Streaming variants of cell-grid builders (v2 + legacy).
+
+Extracted from cv_cell_grid.py. These yield cells one-by-one as OCR'd,
+useful for progress reporting.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Any, Dict, Generator, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import PageRegion, RowGeometry
+from cv_ocr_engines import (
+ RAPIDOCR_AVAILABLE,
+ _assign_row_words_to_columns,
+)
+from cv_cell_grid_helpers import (
+ _heal_row_gaps,
+ _is_artifact_row,
+)
+from cv_cell_grid_build import _ocr_cell_crop
+from cv_cell_grid_legacy import _ocr_single_cell
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# build_cell_grid_v2_streaming
+# ---------------------------------------------------------------------------
+
def build_cell_grid_v2_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid_v2 -- yields each cell as OCR'd.

    Filters rows down to non-empty, non-artifact content rows, keeps only
    non-skipped columns, heals vertical gaps between the detected header and
    footer boundaries, then OCRs every (row, column) cell via _ocr_cell_crop
    and yields it immediately -- useful for progress reporting.

    Args:
        ocr_img: Full-page image handed to _ocr_cell_crop.
        column_regions: Classified column regions; skip-typed ones are ignored.
        row_geometries: Row geometries with row_type/word_count/y/height.
        img_w, img_h: Image dimensions in pixels.
        lang: Default OCR language string.
        ocr_engine: 'auto', 'tesseract', 'rapid', 'trocr-*' or 'lighton'.
        img_bgr: Optional BGR color image, forwarded to _ocr_cell_crop.

    Yields:
        (cell_dict, columns_meta, total_cells)
    """
    # Engine resolution. Unlike the legacy build_cell_grid_streaming below,
    # 'auto' here always resolves to Tesseract -- RapidOCR is only chosen
    # when explicitly requested (and available).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        engine_name = "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        # use_rapid only feeds engine_name here; it is not passed downstream.
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    # Filter 1: only 'content' rows participate in the grid.
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    # Filter 2: drop phantom rows that carry no words.
    content_rows = [r for r in content_rows if r.word_count > 0]
    if not content_rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        return

    # Filter 3: drop rows classified as scan artifacts.
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    if not content_rows:
        return

    # Use header/footer boundaries for heal_row_gaps
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        # Bottom edge of the lowest header row bounds the content area above.
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        # Top edge of the highest footer row bounds the content area below.
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    # Presumably mutates content_rows in place (return value unused) --
    # confirm against cv_cell_grid_helpers._heal_row_gaps.
    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    # Left-to-right column order determines col_index in yielded cells.
    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    # Per-column-type OCR language override, forwarded to _ocr_cell_crop.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # Total cell count, reported with every yield for progress display.
    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_cell_crop(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                engine_name, lang, lang_map,
            )
            yield cell, columns_meta, total_cells
+
+
+# ---------------------------------------------------------------------------
+# build_cell_grid_streaming — legacy streaming variant
+# ---------------------------------------------------------------------------
+
def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Like build_cell_grid(), but yields each cell as it is OCR'd.

    DEPRECATED: Use build_cell_grid_v2_streaming instead.

    Filters to non-empty, non-artifact content rows, heals row gaps using
    the column extents as vertical bounds, pre-assigns the row's words to
    columns, then OCRs each (row, column) cell via _ocr_single_cell.

    Yields:
        (cell_dict, columns_meta, total_cells) for each cell.
    """
    # Engine resolution: here 'auto' prefers RapidOCR when it is available
    # and a BGR image was supplied (the v2 variant always picks Tesseract).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    # Filter 1: only 'content' rows participate in the grid.
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    # Filter 2: drop phantom rows that carry no words (logged for diagnosis).
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        return

    # Filter 3: drop rows classified as scan artifacts (logged for diagnosis).
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        return
    # Column extents serve as healing bounds here (the v2 variant uses
    # header/footer rows instead). Presumably mutates content_rows in
    # place (return value unused) -- confirm in cv_cell_grid_helpers.
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    # Left-to-right column order determines col_index in yielded cells.
    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    # Per-column-type OCR language override, forwarded to _ocr_single_cell.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # Total cell count, reported with every yield for progress display.
    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        # Pre-assign the row's detected words to columns once per row,
        # then hand the matching subset to each cell's OCR call.
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            yield cell, columns_meta, total_cells
diff --git a/klausur-service/backend/cv_cell_grid_vocab.py b/klausur-service/backend/cv_cell_grid_vocab.py
new file mode 100644
index 0000000..d475c33
--- /dev/null
+++ b/klausur-service/backend/cv_cell_grid_vocab.py
@@ -0,0 +1,200 @@
+"""
+Vocabulary extraction: cells -> vocab entries, and build_word_grid wrapper.
+
+Extracted from cv_cell_grid.py.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Any, Dict, List
+
+from cv_ocr_engines import (
+ _attach_example_sentences,
+ _fix_phonetic_brackets,
+ _split_comma_entries,
+)
+from cv_cell_grid_legacy import build_cell_grid
+from cv_cell_grid_merge import (
+ _merge_continuation_rows,
+ _merge_phonetic_continuation_rows,
+ _merge_wrapped_rows,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _cells_to_vocab_entries(
+ cells: List[Dict[str, Any]],
+ columns_meta: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+ """Map generic cells to vocab entries with english/german/example fields.
+
+ Groups cells by row_index, maps col_type -> field name, and produces
+ one entry per row (only rows with at least one non-empty field).
+ """
+ col_type_to_field = {
+ 'column_en': 'english',
+ 'column_de': 'german',
+ 'column_example': 'example',
+ 'page_ref': 'source_page',
+ 'column_marker': 'marker',
+ 'column_text': 'text', # generic single-column (box sub-sessions)
+ }
+ bbox_key_map = {
+ 'column_en': 'bbox_en',
+ 'column_de': 'bbox_de',
+ 'column_example': 'bbox_ex',
+ 'page_ref': 'bbox_ref',
+ 'column_marker': 'bbox_marker',
+ 'column_text': 'bbox_text',
+ }
+
+ # Group cells by row_index
+ rows: Dict[int, List[Dict]] = {}
+ for cell in cells:
+ ri = cell['row_index']
+ rows.setdefault(ri, []).append(cell)
+
+ entries: List[Dict[str, Any]] = []
+ for row_idx in sorted(rows.keys()):
+ row_cells = rows[row_idx]
+ entry: Dict[str, Any] = {
+ 'row_index': row_idx,
+ 'english': '',
+ 'german': '',
+ 'example': '',
+ 'text': '', # generic single-column (box sub-sessions)
+ 'source_page': '',
+ 'marker': '',
+ 'confidence': 0.0,
+ 'bbox': None,
+ 'bbox_en': None,
+ 'bbox_de': None,
+ 'bbox_ex': None,
+ 'bbox_ref': None,
+ 'bbox_marker': None,
+ 'bbox_text': None,
+ 'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '',
+ }
+
+ confidences = []
+ for cell in row_cells:
+ col_type = cell['col_type']
+ field = col_type_to_field.get(col_type)
+ if field:
+ entry[field] = cell['text']
+ bbox_field = bbox_key_map.get(col_type)
+ if bbox_field:
+ entry[bbox_field] = cell['bbox_pct']
+ if cell['confidence'] > 0:
+ confidences.append(cell['confidence'])
+
+ # Compute row-level bbox as union of all cell bboxes
+ all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')]
+ if all_bboxes:
+ min_x = min(b['x'] for b in all_bboxes)
+ min_y = min(b['y'] for b in all_bboxes)
+ max_x2 = max(b['x'] + b['w'] for b in all_bboxes)
+ max_y2 = max(b['y'] + b['h'] for b in all_bboxes)
+ entry['bbox'] = {
+ 'x': round(min_x, 2),
+ 'y': round(min_y, 2),
+ 'w': round(max_x2 - min_x, 2),
+ 'h': round(max_y2 - min_y, 2),
+ }
+
+ entry['confidence'] = round(
+ sum(confidences) / len(confidences), 1
+ ) if confidences else 0.0
+
+ # Only include if at least one mapped field has text
+ has_content = any(
+ entry.get(f)
+ for f in col_type_to_field.values()
+ )
+ if has_content:
+ entries.append(entry)
+
+ return entries
+
+
def build_word_grid(
    ocr_img,
    column_regions,
    row_geometries,
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr=None,
    pronunciation: str = "british",
) -> List[Dict[str, Any]]:
    """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing.

    Wrapper around build_cell_grid() that adds vocabulary-specific logic:
    - Maps cells to english/german/example entries
    - Applies character confusion fixes, IPA lookup, comma splitting, etc.
    - Falls back to returning raw cells if no vocab columns detected.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3.
        row_geometries: Rows from Step 4.
        img_w, img_h: Image dimensions.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', or 'auto'.
        img_bgr: BGR color image (required for RapidOCR).
        pronunciation: 'british' or 'american' for IPA lookup.

    Returns:
        List of entry dicts with english/german/example text and bbox info (percent).
    """
    cells, columns_meta = build_cell_grid(
        ocr_img, column_regions, row_geometries, img_w, img_h,
        lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
    )

    if not cells:
        return []

    # Check if vocab layout is present
    col_types = {c['type'] for c in columns_meta}
    if not (col_types & {'column_en', 'column_de'}):
        logger.info("build_word_grid: no vocab columns -- returning raw cells")
        return cells

    # Vocab mapping: cells -> entries
    entries = _cells_to_vocab_entries(cells, columns_meta)

    # --- Post-processing pipeline (deterministic, no LLM) ---
    n_raw = len(entries)

    # 0. Merge cell-wrap continuation rows (empty primary column = text wrap)
    entries = _merge_wrapped_rows(entries)

    # 0a. Merge phonetic-only continuation rows into previous entry
    entries = _merge_phonetic_continuation_rows(entries)

    # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
    entries = _merge_continuation_rows(entries)

    # 1. Character confusion (| -> I, 1 -> I, 8 -> B) is now run in
    #    llm_review_entries_streaming so changes are visible to the user in Step 6.

    # 2. Replace OCR'd phonetics with dictionary IPA
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)

    # 3. Split comma-separated word forms (break, broke, broken -> 3 entries)
    entries = _split_comma_entries(entries)

    # 4. Attach example sentences (rows without DE -> examples for preceding entry)
    entries = _attach_example_sentences(entries)

    engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown'
    # BUGFIX: the summary log previously interpolated len(entries) for both
    # the "raw" and "after" counts, so n_raw was computed but never reported.
    logger.info(f"build_word_grid: {n_raw} raw entries -> "
                f"{len(entries)} after post-processing "
                f"(engine={engine_name})")

    return entries
diff --git a/klausur-service/backend/cv_preprocessing.py b/klausur-service/backend/cv_preprocessing.py
index 71c4f50..0cb2841 100644
--- a/klausur-service/backend/cv_preprocessing.py
+++ b/klausur-service/backend/cv_preprocessing.py
@@ -1,14 +1,19 @@
"""
Image I/O, orientation detection, deskew, and dewarp for the CV vocabulary pipeline.
+Re-export facade -- all logic lives in the sub-modules:
+
+ cv_preprocessing_deskew Rotation correction (Hough, word-alignment, iterative, two-pass)
+ cv_preprocessing_dewarp Vertical shear detection and correction (4 methods + ensemble)
+
+This file contains the image I/O and orientation detection functions.
+
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
-import time
-from collections import defaultdict
-from typing import Any, Dict, List, Tuple
+from typing import Tuple
import numpy as np
@@ -19,7 +24,7 @@ from cv_vocab_types import (
logger = logging.getLogger(__name__)
-# Guarded imports — mirror cv_vocab_types guards
+# Guarded imports
try:
import cv2
except ImportError:
@@ -32,6 +37,33 @@ except ImportError:
pytesseract = None # type: ignore[assignment]
Image = None # type: ignore[assignment,misc]
+# Re-export all deskew functions
+from cv_preprocessing_deskew import ( # noqa: F401
+ deskew_image,
+ deskew_image_by_word_alignment,
+ deskew_image_iterative,
+ deskew_two_pass,
+ _projection_gradient_score,
+ _measure_textline_slope,
+)
+
+# Re-export all dewarp functions
+from cv_preprocessing_dewarp import ( # noqa: F401
+ _apply_shear,
+ _detect_shear_angle,
+ _detect_shear_by_hough,
+ _detect_shear_by_projection,
+ _detect_shear_by_text_lines,
+ _dewarp_quality_check,
+ _ensemble_shear,
+ dewarp_image,
+ dewarp_image_manual,
+)
+
+
+# =============================================================================
+# Image I/O
+# =============================================================================
def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray:
"""Render a PDF page to a high-resolution numpy array (BGR).
@@ -54,7 +86,6 @@ def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0
mat = fitz.Matrix(zoom, zoom)
pix = page.get_pixmap(matrix=mat)
- # Convert to numpy BGR
img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
if pix.n == 4: # RGBA
img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR)
@@ -84,23 +115,19 @@ def render_image_high_res(image_data: bytes) -> np.ndarray:
# =============================================================================
-# Stage 1b: Orientation Detection (0°/90°/180°/270°)
+# Orientation Detection (0/90/180/270)
# =============================================================================
def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
"""Detect page orientation via Tesseract OSD and rotate if needed.
- Handles upside-down scans (180°) common with book scanners where
- every other page is flipped due to the scanner hinge.
-
Returns:
- (corrected_image, rotation_degrees) — rotation is 0, 90, 180, or 270.
+ (corrected_image, rotation_degrees) -- rotation is 0, 90, 180, or 270.
"""
if pytesseract is None:
return img_bgr, 0
try:
- # Tesseract OSD needs a grayscale or RGB image
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
pil_img = Image.fromarray(gray)
@@ -108,12 +135,11 @@ def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
rotate = osd.get("rotate", 0)
confidence = osd.get("orientation_conf", 0.0)
- logger.info(f"OSD: orientation={rotate}° confidence={confidence:.1f}")
+ logger.info(f"OSD: orientation={rotate}\u00b0 confidence={confidence:.1f}")
if rotate == 0 or confidence < 1.0:
return img_bgr, 0
- # Apply rotation — OSD rotate is the clockwise correction needed
if rotate == 180:
corrected = cv2.rotate(img_bgr, cv2.ROTATE_180)
elif rotate == 90:
@@ -123,1044 +149,9 @@ def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
else:
return img_bgr, 0
- logger.info(f"OSD: rotated {rotate}° to fix orientation")
+ logger.info(f"OSD: rotated {rotate}\u00b0 to fix orientation")
return corrected, rotate
except Exception as e:
logger.warning(f"OSD orientation detection failed: {e}")
return img_bgr, 0
-
-
-# =============================================================================
-# Stage 2: Deskew (Rotation Correction)
-# =============================================================================
-
-def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
- """Correct rotation using Hough Line detection.
-
- Args:
- img: BGR image.
-
- Returns:
- Tuple of (corrected image, detected angle in degrees).
- """
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
- # Binarize for line detection
- _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
-
- # Detect lines
- lines = cv2.HoughLinesP(binary, 1, np.pi / 180, threshold=100,
- minLineLength=img.shape[1] // 4, maxLineGap=20)
-
- if lines is None or len(lines) < 3:
- return img, 0.0
-
- # Compute angles of near-horizontal lines
- angles = []
- for line in lines:
- x1, y1, x2, y2 = line[0]
- angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
- if abs(angle) < 15: # Only near-horizontal
- angles.append(angle)
-
- if not angles:
- return img, 0.0
-
- median_angle = float(np.median(angles))
-
- # Limit correction to ±5°
- if abs(median_angle) > 5.0:
- median_angle = 5.0 * np.sign(median_angle)
-
- if abs(median_angle) < 0.1:
- return img, 0.0
-
- # Rotate
- h, w = img.shape[:2]
- center = (w // 2, h // 2)
- M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
- corrected = cv2.warpAffine(img, M, (w, h),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE)
-
- logger.info(f"Deskew: corrected {median_angle:.2f}° rotation")
- return corrected, median_angle
-
-
-def deskew_image_by_word_alignment(
- image_data: bytes,
- lang: str = "eng+deu",
- downscale_factor: float = 0.5,
-) -> Tuple[bytes, float]:
- """Correct rotation by fitting a line through left-most word starts per text line.
-
- More robust than Hough-based deskew for vocabulary worksheets where text lines
- have consistent left-alignment. Runs a quick Tesseract pass on a downscaled
- copy to find word positions, computes the dominant left-edge column, fits a
- line through those points and rotates the full-resolution image.
-
- Args:
- image_data: Raw image bytes (PNG/JPEG).
- lang: Tesseract language string for the quick pass.
- downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).
-
- Returns:
- Tuple of (rotated image as PNG bytes, detected angle in degrees).
- """
- if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
- return image_data, 0.0
-
- # 1. Decode image
- img_array = np.frombuffer(image_data, dtype=np.uint8)
- img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
- if img is None:
- logger.warning("deskew_by_word_alignment: could not decode image")
- return image_data, 0.0
-
- orig_h, orig_w = img.shape[:2]
-
- # 2. Downscale for fast Tesseract pass
- small_w = int(orig_w * downscale_factor)
- small_h = int(orig_h * downscale_factor)
- small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)
-
- # 3. Quick Tesseract — word-level positions
- pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
- try:
- data = pytesseract.image_to_data(
- pil_small, lang=lang, config="--psm 6 --oem 3",
- output_type=pytesseract.Output.DICT,
- )
- except Exception as e:
- logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
- return image_data, 0.0
-
- # 4. Per text-line, find the left-most word start
- # Group by (block_num, par_num, line_num)
- line_groups: Dict[tuple, list] = defaultdict(list)
- for i in range(len(data["text"])):
- text = (data["text"][i] or "").strip()
- conf = int(data["conf"][i])
- if not text or conf < 20:
- continue
- key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
- line_groups[key].append(i)
-
- if len(line_groups) < 5:
- logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
- return image_data, 0.0
-
- # For each line, pick the word with smallest 'left' → compute (left_x, center_y)
- # Scale back to original resolution
- scale = 1.0 / downscale_factor
- points = [] # list of (x, y) in original-image coords
- for key, indices in line_groups.items():
- best_idx = min(indices, key=lambda i: data["left"][i])
- lx = data["left"][best_idx] * scale
- top = data["top"][best_idx] * scale
- h = data["height"][best_idx] * scale
- cy = top + h / 2.0
- points.append((lx, cy))
-
- # 5. Find dominant left-edge column + compute angle
- xs = np.array([p[0] for p in points])
- ys = np.array([p[1] for p in points])
- median_x = float(np.median(xs))
- tolerance = orig_w * 0.03 # 3% of image width
-
- mask = np.abs(xs - median_x) <= tolerance
- filtered_xs = xs[mask]
- filtered_ys = ys[mask]
-
- if len(filtered_xs) < 5:
- logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
- return image_data, 0.0
-
- # polyfit: x = a*y + b → a = dx/dy → angle = arctan(a)
- coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
- slope = coeffs[0] # dx/dy
- angle_rad = np.arctan(slope)
- angle_deg = float(np.degrees(angle_rad))
-
- # Clamp to ±5°
- angle_deg = max(-5.0, min(5.0, angle_deg))
-
- logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}° from {len(filtered_xs)} points "
- f"(total lines: {len(line_groups)})")
-
- if abs(angle_deg) < 0.05:
- return image_data, 0.0
-
- # 6. Rotate full-res image
- center = (orig_w // 2, orig_h // 2)
- M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
- rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE)
-
- # Encode back to PNG
- success, png_buf = cv2.imencode(".png", rotated)
- if not success:
- logger.warning("deskew_by_word_alignment: PNG encoding failed")
- return image_data, 0.0
-
- return png_buf.tobytes(), angle_deg
-
-
-def _projection_gradient_score(profile: np.ndarray) -> float:
- """Score a projection profile by the L2-norm of its first derivative.
-
- Higher score = sharper transitions between text-lines and gaps,
- i.e. better row/column alignment.
- """
- diff = np.diff(profile)
- return float(np.sum(diff * diff))
-
-
-def deskew_image_iterative(
- img: np.ndarray,
- coarse_range: float = 5.0,
- coarse_step: float = 0.1,
- fine_range: float = 0.15,
- fine_step: float = 0.02,
-) -> Tuple[np.ndarray, float, Dict[str, Any]]:
- """Iterative deskew using vertical-edge projection optimisation.
-
- The key insight: at the correct rotation angle, vertical features
- (word left-edges, column borders) become truly vertical, producing
- the sharpest peaks in the vertical projection of vertical edges.
-
- Method:
- 1. Detect vertical edges via Sobel-X on the central crop.
- 2. Coarse sweep: rotate edge image, compute vertical projection
- gradient score. The angle where vertical edges align best wins.
- 3. Fine sweep: refine around the coarse winner.
-
- Args:
- img: BGR image (full resolution).
- coarse_range: half-range in degrees for the coarse sweep.
- coarse_step: step size in degrees for the coarse sweep.
- fine_range: half-range around the coarse winner for the fine sweep.
- fine_step: step size in degrees for the fine sweep.
-
- Returns:
- (rotated_bgr, angle_degrees, debug_dict)
- """
- h, w = img.shape[:2]
- debug: Dict[str, Any] = {}
-
- # --- Grayscale + vertical edge detection ---
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- # Central crop (15%-85% height, 10%-90% width) to avoid page margins
- y_lo, y_hi = int(h * 0.15), int(h * 0.85)
- x_lo, x_hi = int(w * 0.10), int(w * 0.90)
- gray_crop = gray[y_lo:y_hi, x_lo:x_hi]
-
- # Sobel-X → absolute vertical edges
- sobel_x = cv2.Sobel(gray_crop, cv2.CV_64F, 1, 0, ksize=3)
- edges = np.abs(sobel_x)
- # Normalise to 0-255 for consistent scoring
- edge_max = edges.max()
- if edge_max > 0:
- edges = (edges / edge_max * 255).astype(np.uint8)
- else:
- return img, 0.0, {"error": "no edges detected"}
-
- crop_h, crop_w = edges.shape[:2]
- crop_center = (crop_w // 2, crop_h // 2)
-
- # Trim margin after rotation to avoid border artifacts
- trim_y = max(4, int(crop_h * 0.03))
- trim_x = max(4, int(crop_w * 0.03))
-
- def _sweep_edges(angles: np.ndarray) -> list:
- """Score each angle by vertical projection gradient of vertical edges."""
- results = []
- for angle in angles:
- if abs(angle) < 1e-6:
- rotated = edges
- else:
- M = cv2.getRotationMatrix2D(crop_center, angle, 1.0)
- rotated = cv2.warpAffine(edges, M, (crop_w, crop_h),
- flags=cv2.INTER_NEAREST,
- borderMode=cv2.BORDER_REPLICATE)
- # Trim borders to avoid edge artifacts
- trimmed = rotated[trim_y:-trim_y, trim_x:-trim_x]
- v_profile = np.sum(trimmed, axis=0, dtype=np.float64)
- score = _projection_gradient_score(v_profile)
- results.append((float(angle), score))
- return results
-
- # --- Phase 1: coarse sweep ---
- coarse_angles = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step)
- coarse_results = _sweep_edges(coarse_angles)
- best_coarse = max(coarse_results, key=lambda x: x[1])
- best_coarse_angle, best_coarse_score = best_coarse
-
- debug["coarse_best_angle"] = round(best_coarse_angle, 2)
- debug["coarse_best_score"] = round(best_coarse_score, 1)
- debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results]
-
- # --- Phase 2: fine sweep around coarse winner ---
- fine_lo = best_coarse_angle - fine_range
- fine_hi = best_coarse_angle + fine_range
- fine_angles = np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step)
- fine_results = _sweep_edges(fine_angles)
- best_fine = max(fine_results, key=lambda x: x[1])
- best_fine_angle, best_fine_score = best_fine
-
- debug["fine_best_angle"] = round(best_fine_angle, 2)
- debug["fine_best_score"] = round(best_fine_score, 1)
- debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results]
-
- final_angle = best_fine_angle
-
- # Clamp to ±5°
- final_angle = max(-5.0, min(5.0, final_angle))
-
- logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}° fine={best_fine_angle:.2f}° -> {final_angle:.2f}°")
-
- if abs(final_angle) < 0.05:
- return img, 0.0, debug
-
- # --- Rotate full-res image ---
- center = (w // 2, h // 2)
- M = cv2.getRotationMatrix2D(center, final_angle, 1.0)
- rotated = cv2.warpAffine(img, M, (w, h),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE)
-
- return rotated, final_angle, debug
-
-
-def _measure_textline_slope(img: np.ndarray) -> float:
- """Measure residual text-line slope via Tesseract word-position regression.
-
- Groups Tesseract words by (block, par, line), fits a linear regression
- per line (y = slope * x + b), and returns the trimmed-mean slope in
- degrees. Positive = text rises to the right, negative = falls.
-
- This is the most direct measurement of remaining rotation after deskew.
- """
- import math as _math
-
- if not TESSERACT_AVAILABLE or not CV2_AVAILABLE:
- return 0.0
-
- h, w = img.shape[:2]
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
- data = pytesseract.image_to_data(
- Image.fromarray(gray),
- output_type=pytesseract.Output.DICT,
- config="--psm 6",
- )
-
- # Group word centres by text line
- lines: Dict[tuple, list] = {}
- for i in range(len(data["text"])):
- txt = (data["text"][i] or "").strip()
- if len(txt) < 2 or int(data["conf"][i]) < 30:
- continue
- key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
- cx = data["left"][i] + data["width"][i] / 2.0
- cy = data["top"][i] + data["height"][i] / 2.0
- lines.setdefault(key, []).append((cx, cy))
-
- # Per-line linear regression → slope angle
- slopes: list = []
- for pts in lines.values():
- if len(pts) < 3:
- continue
- pts.sort(key=lambda p: p[0])
- xs = np.array([p[0] for p in pts], dtype=np.float64)
- ys = np.array([p[1] for p in pts], dtype=np.float64)
- if xs[-1] - xs[0] < w * 0.15:
- continue # skip short lines
- A = np.vstack([xs, np.ones_like(xs)]).T
- result = np.linalg.lstsq(A, ys, rcond=None)
- slope = result[0][0]
- slopes.append(_math.degrees(_math.atan(slope)))
-
- if len(slopes) < 3:
- return 0.0
-
- # Trimmed mean (drop 10% extremes on each side)
- slopes.sort()
- trim = max(1, len(slopes) // 10)
- trimmed = slopes[trim:-trim] if len(slopes) > 2 * trim else slopes
- if not trimmed:
- return 0.0
-
- return sum(trimmed) / len(trimmed)
-
-
-def deskew_two_pass(
- img: np.ndarray,
- coarse_range: float = 5.0,
-) -> Tuple[np.ndarray, float, Dict[str, Any]]:
- """Two-pass deskew: iterative projection + word-alignment residual check.
-
- Pass 1: ``deskew_image_iterative()`` (vertical-edge projection, wide range).
- Pass 2: ``deskew_image_by_word_alignment()`` on the already-corrected image
- to detect and fix residual skew that the projection method missed.
-
- The two corrections are summed. If the residual from Pass 2 is below
- 0.3° it is ignored (already good enough).
-
- Returns:
- (corrected_bgr, total_angle_degrees, debug_dict)
- """
- debug: Dict[str, Any] = {}
-
- # --- Pass 1: iterative projection ---
- corrected, angle1, dbg1 = deskew_image_iterative(
- img.copy(), coarse_range=coarse_range,
- )
- debug["pass1_angle"] = round(angle1, 3)
- debug["pass1_method"] = "iterative"
- debug["pass1_debug"] = dbg1
-
- # --- Pass 2: word-alignment residual check on corrected image ---
- angle2 = 0.0
- try:
- # Encode the corrected image to PNG bytes for word-alignment
- ok, buf = cv2.imencode(".png", corrected)
- if ok:
- corrected_bytes, angle2 = deskew_image_by_word_alignment(buf.tobytes())
- if abs(angle2) >= 0.3:
- # Significant residual — decode and use the second correction
- arr2 = np.frombuffer(corrected_bytes, dtype=np.uint8)
- corrected2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR)
- if corrected2 is not None:
- corrected = corrected2
- logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° applied "
- f"(total={angle1 + angle2:.2f}°)")
- else:
- angle2 = 0.0
- else:
- logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° < 0.3° — skipped")
- angle2 = 0.0
- except Exception as e:
- logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}")
- angle2 = 0.0
-
- # --- Pass 3: Tesseract text-line regression residual check ---
- # The most reliable final check: measure actual text-line slopes
- # using Tesseract word positions and linear regression per line.
- angle3 = 0.0
- try:
- residual = _measure_textline_slope(corrected)
- debug["pass3_raw"] = round(residual, 3)
- if abs(residual) >= 0.3:
- h3, w3 = corrected.shape[:2]
- center3 = (w3 // 2, h3 // 2)
- M3 = cv2.getRotationMatrix2D(center3, residual, 1.0)
- corrected = cv2.warpAffine(
- corrected, M3, (w3, h3),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE,
- )
- angle3 = residual
- logger.info(
- "deskew_two_pass: pass3 text-line residual=%.2f° applied",
- residual,
- )
- else:
- logger.info(
- "deskew_two_pass: pass3 text-line residual=%.2f° < 0.3° — skipped",
- residual,
- )
- except Exception as e:
- logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e)
-
- total_angle = angle1 + angle2 + angle3
- debug["pass2_angle"] = round(angle2, 3)
- debug["pass2_method"] = "word_alignment"
- debug["pass3_angle"] = round(angle3, 3)
- debug["pass3_method"] = "textline_regression"
- debug["total_angle"] = round(total_angle, 3)
-
- logger.info(
- "deskew_two_pass: pass1=%.2f° + pass2=%.2f° + pass3=%.2f° = %.2f°",
- angle1, angle2, angle3, total_angle,
- )
-
- return corrected, total_angle, debug
-
-
-# =============================================================================
-# Stage 3: Dewarp (Book Curvature Correction)
-# =============================================================================
-
-def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
- """Detect the vertical shear angle of the page.
-
- After deskew (horizontal lines aligned), vertical features like column
- edges may still be tilted. This measures that tilt by tracking the
- strongest vertical edge across horizontal strips.
-
- The result is a shear angle in degrees: the angular difference between
- true vertical and the detected column edge.
-
- Returns:
- Dict with keys: method, shear_degrees, confidence.
- """
- h, w = img.shape[:2]
- result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}
-
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- # Vertical Sobel to find vertical edges
- sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
- abs_sobel = np.abs(sobel_x).astype(np.uint8)
-
- # Binarize with Otsu
- _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
-
- num_strips = 20
- strip_h = h // num_strips
- edge_positions = [] # (y_center, x_position)
-
- for i in range(num_strips):
- y_start = i * strip_h
- y_end = min((i + 1) * strip_h, h)
- strip = binary[y_start:y_end, :]
-
- # Project vertically (sum along y-axis)
- projection = np.sum(strip, axis=0).astype(np.float64)
- if projection.max() == 0:
- continue
-
- # Find the strongest vertical edge in left 40% of image
- search_w = int(w * 0.4)
- left_proj = projection[:search_w]
- if left_proj.max() == 0:
- continue
-
- # Smooth and find peak
- kernel_size = max(3, w // 100)
- if kernel_size % 2 == 0:
- kernel_size += 1
- smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
- x_pos = float(np.argmax(smoothed))
- y_center = (y_start + y_end) / 2.0
- edge_positions.append((y_center, x_pos))
-
- if len(edge_positions) < 8:
- return result
-
- ys = np.array([p[0] for p in edge_positions])
- xs = np.array([p[1] for p in edge_positions])
-
- # Remove outliers (> 2 std from median)
- median_x = np.median(xs)
- std_x = max(np.std(xs), 1.0)
- mask = np.abs(xs - median_x) < 2 * std_x
- ys = ys[mask]
- xs = xs[mask]
-
- if len(ys) < 6:
- return result
-
- # Fit straight line: x = slope * y + intercept
- # The slope tells us the tilt of the vertical edge
- straight_coeffs = np.polyfit(ys, xs, 1)
- slope = straight_coeffs[0] # dx/dy in pixels
- fitted = np.polyval(straight_coeffs, ys)
- residuals = xs - fitted
- rmse = float(np.sqrt(np.mean(residuals ** 2)))
-
- # Convert slope to angle: arctan(dx/dy) in degrees
- import math
- shear_degrees = math.degrees(math.atan(slope))
-
- confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)
-
- result["shear_degrees"] = round(shear_degrees, 3)
- result["confidence"] = round(float(confidence), 2)
-
- return result
-
-
-def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
- """Detect shear angle by maximising variance of horizontal text-line projections.
-
- Principle: horizontal text lines produce a row-projection profile with sharp
- peaks (high variance) when the image is correctly aligned. Any residual shear
- smears the peaks and reduces variance. We sweep ±3° and pick the angle whose
- corrected projection has the highest variance.
-
- Works best on pages with clear horizontal banding (vocabulary tables, prose).
- Complements _detect_shear_angle() which needs strong vertical edges.
-
- Returns:
- Dict with keys: method, shear_degrees, confidence.
- """
- import math
- result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}
-
- h, w = img.shape[:2]
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- # Otsu binarisation
- _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
-
- # Work at half resolution for speed
- small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA)
- sh, sw = small.shape
-
- # 2-pass angle sweep for 10x better precision:
- # Pass 1: Coarse sweep ±3° in 0.5° steps (13 values)
- # Pass 2: Fine sweep ±0.5° around coarse best in 0.05° steps (21 values)
-
- def _sweep_variance(angles_list):
- results = []
- for angle_deg in angles_list:
- if abs(angle_deg) < 0.001:
- rotated = small
- else:
- shear_tan = math.tan(math.radians(angle_deg))
- M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]])
- rotated = cv2.warpAffine(small, M, (sw, sh),
- flags=cv2.INTER_NEAREST,
- borderMode=cv2.BORDER_CONSTANT)
- profile = np.sum(rotated, axis=1).astype(float)
- results.append((angle_deg, float(np.var(profile))))
- return results
-
- # Pass 1: coarse
- coarse_angles = [a * 0.5 for a in range(-6, 7)] # 13 values
- coarse_results = _sweep_variance(coarse_angles)
- coarse_best = max(coarse_results, key=lambda x: x[1])
-
- # Pass 2: fine around coarse best
- fine_center = coarse_best[0]
- fine_angles = [fine_center + a * 0.05 for a in range(-10, 11)] # 21 values
- fine_results = _sweep_variance(fine_angles)
- fine_best = max(fine_results, key=lambda x: x[1])
-
- best_angle = fine_best[0]
- best_variance = fine_best[1]
- variances = coarse_results + fine_results
-
- # Confidence: how much sharper is the best angle vs. the mean?
- all_mean = sum(v for _, v in variances) / len(variances)
- if all_mean > 0 and best_variance > all_mean:
- confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6)
- else:
- confidence = 0.0
-
- result["shear_degrees"] = round(best_angle, 3)
- result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
- return result
-
-
-def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
- """Detect shear using Hough transform on printed table / ruled lines.
-
- Vocabulary worksheets have near-horizontal printed table borders. After
- deskew these should be exactly horizontal; any residual tilt equals the
- vertical shear angle (with inverted sign).
-
- The sign convention: a horizontal line tilting +α degrees (left end lower)
- means the page has vertical shear of -α degrees (left column edge drifts
- to the left going downward).
-
- Returns:
- Dict with keys: method, shear_degrees, confidence.
- """
- result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}
-
- h, w = img.shape[:2]
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- edges = cv2.Canny(gray, 50, 150, apertureSize=3)
-
- min_len = int(w * 0.15)
- lines = cv2.HoughLinesP(
- edges, rho=1, theta=np.pi / 360,
- threshold=int(w * 0.08),
- minLineLength=min_len,
- maxLineGap=20,
- )
-
- if lines is None or len(lines) < 3:
- return result
-
- horizontal_angles: List[Tuple[float, float]] = []
- for line in lines:
- x1, y1, x2, y2 = line[0]
- if x1 == x2:
- continue
- angle = float(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
- if abs(angle) <= 5.0:
- length = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2))
- horizontal_angles.append((angle, length))
-
- if len(horizontal_angles) < 3:
- return result
-
- # Weighted median
- angles_arr = np.array([a for a, _ in horizontal_angles])
- weights_arr = np.array([l for _, l in horizontal_angles])
- sorted_idx = np.argsort(angles_arr)
- s_angles = angles_arr[sorted_idx]
- s_weights = weights_arr[sorted_idx]
- cum = np.cumsum(s_weights)
- mid_idx = int(np.searchsorted(cum, cum[-1] / 2.0))
- median_angle = float(s_angles[min(mid_idx, len(s_angles) - 1)])
-
- agree = sum(1 for a, _ in horizontal_angles if abs(a - median_angle) < 1.0)
- confidence = min(1.0, agree / max(len(horizontal_angles), 1)) * 0.85
-
- # Sign inversion: horizontal line tilt is complementary to vertical shear
- shear_degrees = -median_angle
-
- result["shear_degrees"] = round(shear_degrees, 3)
- result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
- return result
-
-
-def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
- """Detect shear by measuring text-line straightness (Method D).
-
- Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word
- bounding boxes, groups them into vertical columns by X-proximity,
- and measures how the left-edge X position drifts with Y (vertical
- position). The drift dx/dy is the tangent of the shear angle.
-
- This directly measures vertical shear (column tilt) rather than
- horizontal text-line slope, which is already corrected by deskew.
-
- Returns:
- Dict with keys: method, shear_degrees, confidence.
- """
- import math
- result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0}
-
- h, w = img.shape[:2]
- # Downscale 50% for speed
- scale = 0.5
- small = cv2.resize(img, (int(w * scale), int(h * scale)),
- interpolation=cv2.INTER_AREA)
- gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
- pil_img = Image.fromarray(gray)
-
- try:
- data = pytesseract.image_to_data(
- pil_img, lang='eng+deu', config='--psm 11 --oem 3',
- output_type=pytesseract.Output.DICT,
- )
- except Exception:
- return result
-
- # Collect word left-edges (x) and vertical centres (y)
- words = []
- for i in range(len(data['text'])):
- text = data['text'][i].strip()
- conf = int(data['conf'][i])
- if not text or conf < 20 or len(text) < 2:
- continue
- left_x = float(data['left'][i])
- cy = data['top'][i] + data['height'][i] / 2.0
- word_w = float(data['width'][i])
- words.append((left_x, cy, word_w))
-
- if len(words) < 15:
- return result
-
- # --- Group words into vertical columns by left-edge X proximity ---
- # Sort by x, then cluster words whose left-edges are within x_tol
- avg_w = sum(ww for _, _, ww in words) / len(words)
- x_tol = max(avg_w * 0.4, 8) # tolerance for "same column"
-
- words_by_x = sorted(words, key=lambda w: w[0])
- columns: List[List[Tuple[float, float]]] = [] # each: [(left_x, cy), ...]
- cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
- cur_x = words_by_x[0][0]
-
- for lx, cy, _ in words_by_x[1:]:
- if abs(lx - cur_x) <= x_tol:
- cur_col.append((lx, cy))
- # Update running x as median of cluster
- cur_x = cur_x * 0.8 + lx * 0.2
- else:
- if len(cur_col) >= 5:
- columns.append(cur_col)
- cur_col = [(lx, cy)]
- cur_x = lx
- if len(cur_col) >= 5:
- columns.append(cur_col)
-
- if len(columns) < 2:
- return result
-
- # --- For each column, measure X-drift as a function of Y ---
- # Fit: left_x = a * cy + b → a = dx/dy = tan(shear_angle)
- drifts = []
- for col in columns:
- ys = np.array([p[1] for p in col])
- xs = np.array([p[0] for p in col])
- y_range = ys.max() - ys.min()
- if y_range < h * scale * 0.3:
- continue # column must span at least 30% of image height
- # Linear regression: x = a*y + b
- coeffs = np.polyfit(ys, xs, 1)
- drifts.append(coeffs[0]) # dx/dy
-
- if len(drifts) < 2:
- return result
-
- # Median dx/dy → shear angle
- # dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right
- median_drift = float(np.median(drifts))
- shear_degrees = math.degrees(math.atan(median_drift))
-
- # Confidence from column count + drift consistency
- drift_std = float(np.std(drifts))
- consistency = max(0.0, 1.0 - drift_std * 50) # tighter penalty for drift variance
- count_factor = min(1.0, len(drifts) / 4.0)
- confidence = count_factor * 0.5 + consistency * 0.5
-
- result["shear_degrees"] = round(shear_degrees, 3)
- result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
- logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
- "shear=%.3f°, conf=%.2f",
- len(columns), len(drifts), median_drift,
- shear_degrees, confidence)
- return result
-
-
-def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool:
- """Check whether the dewarp correction actually improved alignment.
-
- Compares horizontal projection variance before and after correction.
- Higher variance means sharper text-line peaks, which indicates better
- horizontal alignment.
-
- Returns True if the correction improved the image, False if it should
- be discarded.
- """
- def _h_proj_variance(img: np.ndarray) -> float:
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
- _, binary = cv2.threshold(gray, 0, 255,
- cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
- small = cv2.resize(binary, (binary.shape[1] // 2, binary.shape[0] // 2),
- interpolation=cv2.INTER_AREA)
- profile = np.sum(small, axis=1).astype(float)
- return float(np.var(profile))
-
- var_before = _h_proj_variance(original)
- var_after = _h_proj_variance(corrected)
-
- # Correction must improve variance (even by a tiny margin)
- return var_after > var_before
-
-
-def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
- """Apply a vertical shear correction to an image.
-
- Shifts each row horizontally proportional to its distance from the
- vertical center. This corrects the tilt of vertical features (columns)
- without affecting horizontal alignment (text lines).
-
- Args:
- img: BGR image.
- shear_degrees: Shear angle in degrees. Positive = shift top-right/bottom-left.
-
- Returns:
- Corrected image.
- """
- import math
- h, w = img.shape[:2]
- shear_tan = math.tan(math.radians(shear_degrees))
-
- # Affine matrix: shift x by shear_tan * (y - h/2)
- # [1 shear_tan -h/2*shear_tan]
- # [0 1 0 ]
- M = np.float32([
- [1, shear_tan, -h / 2.0 * shear_tan],
- [0, 1, 0],
- ])
-
- corrected = cv2.warpAffine(img, M, (w, h),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE)
- return corrected
-
-
-def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
- """Combine multiple shear detections into a single weighted estimate (v2).
-
- Ensemble v2 changes vs v1:
- - Minimum confidence raised to 0.5 (was 0.3)
- - text_lines method gets 1.5× weight boost (most reliable detector)
- - Outlier filter at 1° from weighted mean
-
- Returns:
- (shear_degrees, ensemble_confidence, methods_used_str)
- """
- # Confidence threshold — lowered from 0.5 to 0.35 to catch subtle shear
- # that individual methods detect with moderate confidence.
- _MIN_CONF = 0.35
-
- # text_lines gets a weight boost as the most content-aware method
- _METHOD_WEIGHT_BOOST = {"text_lines": 1.5}
-
- accepted = []
- for d in detections:
- if d["confidence"] < _MIN_CONF:
- continue
- boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0)
- effective_conf = d["confidence"] * boost
- accepted.append((d["shear_degrees"], effective_conf, d["method"]))
-
- if not accepted:
- return 0.0, 0.0, "none"
-
- if len(accepted) == 1:
- deg, conf, method = accepted[0]
- return deg, min(conf, 1.0), method
-
- # First pass: weighted mean
- total_w = sum(c for _, c, _ in accepted)
- w_mean = sum(d * c for d, c, _ in accepted) / total_w
-
- # Outlier filter: keep results within 1° of weighted mean
- filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
- if not filtered:
- filtered = accepted # fallback: keep all
-
- # Second pass: weighted mean on filtered results
- total_w2 = sum(c for _, c, _ in filtered)
- final_deg = sum(d * c for d, c, _ in filtered) / total_w2
-
- # Ensemble confidence: average of individual confidences, boosted when
- # methods agree (all within 0.5° of each other)
- avg_conf = total_w2 / len(filtered)
- spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
- agreement_bonus = 0.15 if spread < 0.5 else 0.0
- ensemble_conf = min(1.0, avg_conf + agreement_bonus)
-
- methods_str = "+".join(m for _, _, m in filtered)
- return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str
-
-
-def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
- """Correct vertical shear after deskew (v2 with quality gate).
-
- After deskew aligns horizontal text lines, vertical features (column
- edges) may still be tilted. This detects the tilt angle using an ensemble
- of four complementary methods and applies an affine shear correction.
-
- Methods (all run in ~150ms total):
- A. _detect_shear_angle() — vertical edge profile (~50ms)
- B. _detect_shear_by_projection() — horizontal text-line variance (~30ms)
- C. _detect_shear_by_hough() — Hough lines on table borders (~20ms)
- D. _detect_shear_by_text_lines() — text-line straightness (~50ms)
-
- Quality gate: after correction, horizontal projection variance is compared
- before vs after. If correction worsened alignment, it is discarded.
-
- Args:
- img: BGR image (already deskewed).
- use_ensemble: If False, fall back to single-method behaviour (method A only).
-
- Returns:
- Tuple of (corrected_image, dewarp_info).
- dewarp_info keys: method, shear_degrees, confidence, detections.
- """
- no_correction = {
- "method": "none",
- "shear_degrees": 0.0,
- "confidence": 0.0,
- "detections": [],
- }
-
- if not CV2_AVAILABLE:
- return img, no_correction
-
- t0 = time.time()
-
- if use_ensemble:
- det_a = _detect_shear_angle(img)
- det_b = _detect_shear_by_projection(img)
- det_c = _detect_shear_by_hough(img)
- det_d = _detect_shear_by_text_lines(img)
- detections = [det_a, det_b, det_c, det_d]
- shear_deg, confidence, method = _ensemble_shear(detections)
- else:
- det_a = _detect_shear_angle(img)
- detections = [det_a]
- shear_deg = det_a["shear_degrees"]
- confidence = det_a["confidence"]
- method = det_a["method"]
-
- duration = time.time() - t0
-
- logger.info(
- "dewarp: ensemble shear=%.3f° conf=%.2f method=%s (%.2fs) | "
- "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f",
- shear_deg, confidence, method, duration,
- detections[0]["shear_degrees"], detections[0]["confidence"],
- detections[1]["shear_degrees"] if len(detections) > 1 else 0.0,
- detections[1]["confidence"] if len(detections) > 1 else 0.0,
- detections[2]["shear_degrees"] if len(detections) > 2 else 0.0,
- detections[2]["confidence"] if len(detections) > 2 else 0.0,
- detections[3]["shear_degrees"] if len(detections) > 3 else 0.0,
- detections[3]["confidence"] if len(detections) > 3 else 0.0,
- )
-
- # Always include individual detections (even when no correction applied)
- _all_detections = [
- {"method": d["method"], "shear_degrees": d["shear_degrees"],
- "confidence": d["confidence"]}
- for d in detections
- ]
-
- # Thresholds: very small shear (<0.08°) is truly irrelevant for OCR.
- # For ensemble confidence, require at least 0.4 (lowered from 0.5 to
- # catch moderate-confidence detections from multiple agreeing methods).
- if abs(shear_deg) < 0.08 or confidence < 0.4:
- no_correction["detections"] = _all_detections
- return img, no_correction
-
- # Apply correction (negate the detected shear to straighten)
- corrected = _apply_shear(img, -shear_deg)
-
- # Quality gate: verify the correction actually improved alignment.
- # For small corrections (< 0.5°), the projection variance change can be
- # negligible, so we skip the quality gate — the cost of a tiny wrong
- # correction is much less than the cost of leaving 0.4° uncorrected
- # (which shifts content ~25px at image edges on tall scans).
- if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected):
- logger.info("dewarp: quality gate REJECTED correction (%.3f°) — "
- "projection variance did not improve", shear_deg)
- no_correction["detections"] = _all_detections
- return img, no_correction
-
- info = {
- "method": method,
- "shear_degrees": shear_deg,
- "confidence": confidence,
- "detections": _all_detections,
- }
-
- return corrected, info
-
-
-def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray:
- """Apply shear correction with a manual angle.
-
- Args:
- img: BGR image (deskewed, before dewarp).
- shear_degrees: Shear angle in degrees to correct.
-
- Returns:
- Corrected image.
- """
- if abs(shear_degrees) < 0.001:
- return img
- return _apply_shear(img, -shear_degrees)
-
diff --git a/klausur-service/backend/cv_preprocessing_deskew.py b/klausur-service/backend/cv_preprocessing_deskew.py
new file mode 100644
index 0000000..1bdb27e
--- /dev/null
+++ b/klausur-service/backend/cv_preprocessing_deskew.py
@@ -0,0 +1,437 @@
+"""
+CV Preprocessing Deskew — Rotation correction via Hough lines, word alignment, and iterative projection.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from collections import defaultdict
+from typing import Any, Dict, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+ CV2_AVAILABLE,
+ TESSERACT_AVAILABLE,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+ import cv2
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+
+try:
+ import pytesseract
+ from PIL import Image
+except ImportError:
+ pytesseract = None # type: ignore[assignment]
+ Image = None # type: ignore[assignment,misc]
+
+
+# =============================================================================
+# Deskew via Hough Lines
+# =============================================================================
+
def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
    """Correct rotation using Hough Line detection.

    Detects near-horizontal line segments, takes the median of their
    angles, and rotates the image back by that angle.

    Args:
        img: BGR image.

    Returns:
        Tuple of (corrected image, detected angle in degrees).  The input
        is returned unchanged with angle 0.0 when OpenCV is unavailable,
        too few lines are found, or the detected angle is negligible.
    """
    # Guard against a missing OpenCV install, consistent with the other
    # deskew helpers in this module (without it, cv2 is None here and the
    # first call below raises AttributeError).
    if not CV2_AVAILABLE:
        return img, 0.0

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    lines = cv2.HoughLinesP(binary, 1, np.pi / 180, threshold=100,
                            minLineLength=img.shape[1] // 4, maxLineGap=20)

    # Need a minimum number of segments for a stable median estimate.
    if lines is None or len(lines) < 3:
        return img, 0.0

    angles = []
    for line in lines:
        x1, y1, x2, y2 = line[0]
        angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
        # Keep only near-horizontal segments (text lines / ruled lines).
        if abs(angle) < 15:
            angles.append(angle)

    if not angles:
        return img, 0.0

    median_angle = float(np.median(angles))

    # Clamp to +/-5 deg: larger corrections are implausible for scans.
    if abs(median_angle) > 5.0:
        median_angle = 5.0 * np.sign(median_angle)

    # Sub-0.1 deg is measurement noise; skip the warp entirely.
    if abs(median_angle) < 0.1:
        return img, 0.0

    h, w = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
    corrected = cv2.warpAffine(img, M, (w, h),
                               flags=cv2.INTER_LINEAR,
                               borderMode=cv2.BORDER_REPLICATE)

    # Lazy %-formatting keeps the call cheap when INFO is disabled,
    # matching the logging style used elsewhere in this module.
    logger.info("Deskew: corrected %.2f\u00b0 rotation", median_angle)
    return corrected, median_angle
+
+
+# =============================================================================
+# Deskew via Word Alignment
+# =============================================================================
+
def deskew_image_by_word_alignment(
    image_data: bytes,
    lang: str = "eng+deu",
    downscale_factor: float = 0.5,
) -> Tuple[bytes, float]:
    """Correct rotation by fitting a line through left-most word starts per text line.

    More robust than Hough-based deskew for vocabulary worksheets where text lines
    have consistent left-alignment.

    Pipeline: decode -> downscale -> quick Tesseract word pass -> group
    words into text lines -> take the left-most word per line -> keep
    lines whose left edge sits near the page-wide median -> fit x = f(y)
    through those points -> rotate by the fitted slope.

    Args:
        image_data: Raw image bytes (PNG/JPEG).
        lang: Tesseract language string for the quick pass.
        downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).

    Returns:
        Tuple of (rotated image as PNG bytes, detected angle in degrees).
        Returns the input bytes with angle 0.0 on decode/OCR failure, when
        too few aligned lines are found, or when the angle is negligible.
    """
    # Needs both OpenCV (decode/rotate) and Tesseract (word boxes).
    if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
        return image_data, 0.0

    img_array = np.frombuffer(image_data, dtype=np.uint8)
    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    if img is None:
        logger.warning("deskew_by_word_alignment: could not decode image")
        return image_data, 0.0

    orig_h, orig_w = img.shape[:2]

    # Run the quick Tesseract pass on a downscaled copy for speed; word
    # coordinates are scaled back to full resolution afterwards.
    small_w = int(orig_w * downscale_factor)
    small_h = int(orig_h * downscale_factor)
    small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)

    pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
    try:
        data = pytesseract.image_to_data(
            pil_small, lang=lang, config="--psm 6 --oem 3",
            output_type=pytesseract.Output.DICT,
        )
    except Exception as e:
        logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
        return image_data, 0.0

    # Group confident words by Tesseract's (block, paragraph, line) id.
    line_groups: Dict[tuple, list] = defaultdict(list)
    for i in range(len(data["text"])):
        text = (data["text"][i] or "").strip()
        conf = int(data["conf"][i])
        if not text or conf < 20:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        line_groups[key].append(i)

    # Too few text lines: a slope fit would be unreliable.
    if len(line_groups) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
        return image_data, 0.0

    # For each line, take the left-most word and record its left edge and
    # vertical centre in original-resolution coordinates.
    scale = 1.0 / downscale_factor
    points = []
    for key, indices in line_groups.items():
        best_idx = min(indices, key=lambda i: data["left"][i])
        lx = data["left"][best_idx] * scale
        top = data["top"][best_idx] * scale
        h = data["height"][best_idx] * scale
        cy = top + h / 2.0
        points.append((lx, cy))

    # Keep only lines whose left edge lies within 3% of the page width
    # of the median left edge -- i.e. the consistently left-aligned lines
    # this method relies on; indented lines are rejected as outliers.
    xs = np.array([p[0] for p in points])
    ys = np.array([p[1] for p in points])
    median_x = float(np.median(xs))
    tolerance = orig_w * 0.03

    mask = np.abs(xs - median_x) <= tolerance
    filtered_xs = xs[mask]
    filtered_ys = ys[mask]

    if len(filtered_xs) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
        return image_data, 0.0

    # Fit x = slope * y + c: for a left-aligned column, the slope of the
    # left edge as a function of y is the page rotation.
    coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
    slope = coeffs[0]
    angle_rad = np.arctan(slope)
    angle_deg = float(np.degrees(angle_rad))

    # Clamp to +/-5 deg: larger values are implausible for scans.
    angle_deg = max(-5.0, min(5.0, angle_deg))

    logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}\u00b0 from {len(filtered_xs)} points "
                f"(total lines: {len(line_groups)})")

    # Sub-0.05 deg is noise; skip the warp and re-encode entirely.
    if abs(angle_deg) < 0.05:
        return image_data, 0.0

    center = (orig_w // 2, orig_h // 2)
    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)

    success, png_buf = cv2.imencode(".png", rotated)
    if not success:
        logger.warning("deskew_by_word_alignment: PNG encoding failed")
        return image_data, 0.0

    return png_buf.tobytes(), angle_deg
+
+
+# =============================================================================
+# Projection Gradient Scoring
+# =============================================================================
+
+def _projection_gradient_score(profile: np.ndarray) -> float:
+ """Score a projection profile by the L2-norm of its first derivative."""
+ diff = np.diff(profile)
+ return float(np.sum(diff * diff))
+
+
+# =============================================================================
+# Iterative Deskew (Vertical-Edge Projection)
+# =============================================================================
+
def deskew_image_iterative(
    img: np.ndarray,
    coarse_range: float = 5.0,
    coarse_step: float = 0.1,
    fine_range: float = 0.15,
    fine_step: float = 0.02,
) -> Tuple[np.ndarray, float, Dict[str, Any]]:
    """Iterative deskew using vertical-edge projection optimisation.

    Sweeps candidate rotation angles over a Sobel vertical-edge map and
    picks the angle maximising the gradient energy of the vertical
    projection profile (sharpest column alignment): first a coarse grid
    over the full range, then a fine grid around the coarse winner.

    Args:
        img: BGR image (full resolution).
        coarse_range: half-range in degrees for the coarse sweep.
        coarse_step: step size in degrees for the coarse sweep.
        fine_range: half-range around the coarse winner for the fine sweep.
        fine_step: step size in degrees for the fine sweep.

    Returns:
        (rotated_bgr, angle_degrees, debug_dict)
    """
    h, w = img.shape[:2]
    debug: Dict[str, Any] = {}

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Central crop (70% of height, 80% of width) so page borders and
    # punch-hole artefacts do not dominate the edge map.
    y_lo, y_hi = int(h * 0.15), int(h * 0.85)
    x_lo, x_hi = int(w * 0.10), int(w * 0.90)
    gray_crop = gray[y_lo:y_hi, x_lo:x_hi]

    # Vertical edges (x-gradient), normalised to uint8 for cheap rotation.
    sobel_x = cv2.Sobel(gray_crop, cv2.CV_64F, 1, 0, ksize=3)
    edges = np.abs(sobel_x)
    edge_max = edges.max()
    if edge_max > 0:
        edges = (edges / edge_max * 255).astype(np.uint8)
    else:
        # Completely flat crop: nothing to align on.
        return img, 0.0, {"error": "no edges detected"}

    crop_h, crop_w = edges.shape[:2]
    crop_center = (crop_w // 2, crop_h // 2)

    # Trim the rotated border so BORDER_REPLICATE artefacts cannot bias
    # the projection score.
    trim_y = max(4, int(crop_h * 0.03))
    trim_x = max(4, int(crop_w * 0.03))

    def _sweep_edges(angles: np.ndarray) -> list:
        # Score each candidate angle by the gradient energy of the
        # vertical projection of the rotated edge map.
        results = []
        for angle in angles:
            if abs(angle) < 1e-6:
                # Identity rotation: reuse the edge map as-is.
                rotated = edges
            else:
                M = cv2.getRotationMatrix2D(crop_center, angle, 1.0)
                rotated = cv2.warpAffine(edges, M, (crop_w, crop_h),
                                         flags=cv2.INTER_NEAREST,
                                         borderMode=cv2.BORDER_REPLICATE)
            trimmed = rotated[trim_y:-trim_y, trim_x:-trim_x]
            v_profile = np.sum(trimmed, axis=0, dtype=np.float64)
            score = _projection_gradient_score(v_profile)
            results.append((float(angle), score))
        return results

    # Coarse sweep over the full range (the half step in the upper bound
    # makes np.arange include the range endpoint).
    coarse_angles = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step)
    coarse_results = _sweep_edges(coarse_angles)
    best_coarse = max(coarse_results, key=lambda x: x[1])
    best_coarse_angle, best_coarse_score = best_coarse

    debug["coarse_best_angle"] = round(best_coarse_angle, 2)
    debug["coarse_best_score"] = round(best_coarse_score, 1)
    debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results]

    # Fine sweep in a narrow window around the coarse winner.
    fine_lo = best_coarse_angle - fine_range
    fine_hi = best_coarse_angle + fine_range
    fine_angles = np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step)
    fine_results = _sweep_edges(fine_angles)
    best_fine = max(fine_results, key=lambda x: x[1])
    best_fine_angle, best_fine_score = best_fine

    debug["fine_best_angle"] = round(best_fine_angle, 2)
    debug["fine_best_score"] = round(best_fine_score, 1)
    debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results]

    # Clamp to +/-5 deg as a sanity bound, like the other deskew methods.
    final_angle = best_fine_angle
    final_angle = max(-5.0, min(5.0, final_angle))

    logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}\u00b0 fine={best_fine_angle:.2f}\u00b0 -> {final_angle:.2f}\u00b0")

    # Sub-0.05 deg is noise; return the original image untouched.
    if abs(final_angle) < 0.05:
        return img, 0.0, debug

    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, final_angle, 1.0)
    rotated = cv2.warpAffine(img, M, (w, h),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)

    return rotated, final_angle, debug
+
+
+# =============================================================================
+# Text-Line Slope Measurement
+# =============================================================================
+
def _measure_textline_slope(img: np.ndarray) -> float:
    """Measure residual text-line slope via Tesseract word-position regression.

    Groups confident words into text lines, least-squares fits a line
    through each sufficiently wide text line's word centres, and returns
    the trimmed mean of the per-line slopes in degrees.  Returns 0.0 when
    the OCR stack is unavailable or too little usable text is found.
    """
    import math as _math

    if not TESSERACT_AVAILABLE or not CV2_AVAILABLE:
        return 0.0

    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    data = pytesseract.image_to_data(
        Image.fromarray(gray),
        output_type=pytesseract.Output.DICT,
        config="--psm 6",
    )

    # Collect word centres, grouped by Tesseract's (block, paragraph, line).
    lines: Dict[tuple, list] = {}
    for i in range(len(data["text"])):
        txt = (data["text"][i] or "").strip()
        # Skip single-character fragments and low-confidence words (noise).
        if len(txt) < 2 or int(data["conf"][i]) < 30:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        cx = data["left"][i] + data["width"][i] / 2.0
        cy = data["top"][i] + data["height"][i] / 2.0
        lines.setdefault(key, []).append((cx, cy))

    slopes: list = []
    for pts in lines.values():
        # At least 3 words per line for a meaningful regression.
        if len(pts) < 3:
            continue
        pts.sort(key=lambda p: p[0])
        xs = np.array([p[0] for p in pts], dtype=np.float64)
        ys = np.array([p[1] for p in pts], dtype=np.float64)
        # Ignore short lines (< 15% of page width): their slope is noisy.
        if xs[-1] - xs[0] < w * 0.15:
            continue
        # Least-squares fit y = slope * x + c through the word centres.
        A = np.vstack([xs, np.ones_like(xs)]).T
        result = np.linalg.lstsq(A, ys, rcond=None)
        slope = result[0][0]
        slopes.append(_math.degrees(_math.atan(slope)))

    if len(slopes) < 3:
        return 0.0

    # Trimmed mean: drop ~10% from each end to suppress outlier lines.
    slopes.sort()
    trim = max(1, len(slopes) // 10)
    trimmed = slopes[trim:-trim] if len(slopes) > 2 * trim else slopes
    if not trimmed:
        return 0.0

    return sum(trimmed) / len(trimmed)
+
+
+# =============================================================================
+# Two-Pass Deskew
+# =============================================================================
+
def deskew_two_pass(
    img: np.ndarray,
    coarse_range: float = 5.0,
) -> Tuple[np.ndarray, float, Dict[str, Any]]:
    """Two-pass deskew: iterative projection + word-alignment residual check.

    Despite the name, three passes run in sequence:

      1. deskew_image_iterative() -- projection-based coarse+fine sweep.
      2. deskew_image_by_word_alignment() -- residual from left-aligned
         word starts; applied only when it is at least 0.3 deg.
      3. _measure_textline_slope() -- residual from text-line regression;
         applied only when it is at least 0.3 deg.

    Passes 2 and 3 are best-effort: any exception is logged and treated
    as a zero residual, so this function never fails outright.

    Args:
        img: BGR image (full resolution).
        coarse_range: half-range in degrees forwarded to pass 1.

    Returns:
        (corrected_bgr, total_angle_degrees, debug_dict)
    """
    debug: Dict[str, Any] = {}

    # --- Pass 1: iterative projection ---
    corrected, angle1, dbg1 = deskew_image_iterative(
        img.copy(), coarse_range=coarse_range,
    )
    debug["pass1_angle"] = round(angle1, 3)
    debug["pass1_method"] = "iterative"
    debug["pass1_debug"] = dbg1

    # --- Pass 2: word-alignment residual check ---
    # Works on encoded PNG bytes, hence the round-trip through imencode/
    # imdecode around the byte-level helper.
    angle2 = 0.0
    try:
        ok, buf = cv2.imencode(".png", corrected)
        if ok:
            corrected_bytes, angle2 = deskew_image_by_word_alignment(buf.tobytes())
            if abs(angle2) >= 0.3:
                arr2 = np.frombuffer(corrected_bytes, dtype=np.uint8)
                corrected2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR)
                if corrected2 is not None:
                    corrected = corrected2
                    logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}\u00b0 applied "
                                f"(total={angle1 + angle2:.2f}\u00b0)")
                else:
                    # Decode of the corrected bytes failed: keep pass-1 result.
                    angle2 = 0.0
            else:
                logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}\u00b0 < 0.3\u00b0 -- skipped")
                angle2 = 0.0
    except Exception as e:
        logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}")
        angle2 = 0.0

    # --- Pass 3: Tesseract text-line regression residual check ---
    angle3 = 0.0
    try:
        residual = _measure_textline_slope(corrected)
        debug["pass3_raw"] = round(residual, 3)
        if abs(residual) >= 0.3:
            h3, w3 = corrected.shape[:2]
            center3 = (w3 // 2, h3 // 2)
            M3 = cv2.getRotationMatrix2D(center3, residual, 1.0)
            corrected = cv2.warpAffine(
                corrected, M3, (w3, h3),
                flags=cv2.INTER_LINEAR,
                borderMode=cv2.BORDER_REPLICATE,
            )
            angle3 = residual
            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 applied", residual)
        else:
            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 < 0.3\u00b0 -- skipped", residual)
    except Exception as e:
        logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e)

    # Report the sum of all applied rotations plus per-pass breakdown.
    total_angle = angle1 + angle2 + angle3
    debug["pass2_angle"] = round(angle2, 3)
    debug["pass2_method"] = "word_alignment"
    debug["pass3_angle"] = round(angle3, 3)
    debug["pass3_method"] = "textline_regression"
    debug["total_angle"] = round(total_angle, 3)

    logger.info(
        "deskew_two_pass: pass1=%.2f\u00b0 + pass2=%.2f\u00b0 + pass3=%.2f\u00b0 = %.2f\u00b0",
        angle1, angle2, angle3, total_angle,
    )

    return corrected, total_angle, debug
diff --git a/klausur-service/backend/cv_preprocessing_dewarp.py b/klausur-service/backend/cv_preprocessing_dewarp.py
new file mode 100644
index 0000000..640c87c
--- /dev/null
+++ b/klausur-service/backend/cv_preprocessing_dewarp.py
@@ -0,0 +1,474 @@
+"""
+CV Preprocessing Dewarp — Vertical shear detection and correction.
+
+Provides four shear detection methods (vertical edge, projection variance,
+Hough lines, text-line drift), ensemble combination, quality gating,
+and the main dewarp_image() function.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import math
+import time
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+ CV2_AVAILABLE,
+ TESSERACT_AVAILABLE,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+ import cv2
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+
+try:
+ import pytesseract
+ from PIL import Image
+except ImportError:
+ pytesseract = None # type: ignore[assignment]
+ Image = None # type: ignore[assignment,misc]
+
+
+# =============================================================================
+# Shear Detection Methods
+# =============================================================================
+
def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
    """Detect vertical shear angle via strongest vertical edge tracking (Method A).

    Splits the page into 20 horizontal strips, locates the strongest
    vertical edge in the left 40% of each strip (document border / table
    edge), and fits a line through the edge positions: the slope of that
    line is the shear angle.

    Args:
        img: BGR image (already deskewed).

    Returns:
        Dict with keys "method", "shear_degrees", "confidence"; the latter
        two stay 0.0 when too few strips yield a usable edge.
    """
    h, w = img.shape[:2]
    result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
    # BUGFIX: clip before the uint8 cast.  A raw astype(np.uint8) wraps
    # gradient magnitudes > 255 modulo 256, turning the strongest edges
    # into weak values and corrupting the Otsu threshold below.
    abs_sobel = np.clip(np.abs(sobel_x), 0, 255).astype(np.uint8)

    _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    num_strips = 20
    strip_h = h // num_strips
    edge_positions = []

    for i in range(num_strips):
        y_start = i * strip_h
        y_end = min((i + 1) * strip_h, h)
        strip = binary[y_start:y_end, :]

        projection = np.sum(strip, axis=0).astype(np.float64)
        if projection.max() == 0:
            continue

        # Only search the left 40% of the page: that is where the page
        # border / table edge lives; content further right is text.
        search_w = int(w * 0.4)
        left_proj = projection[:search_w]
        if left_proj.max() == 0:
            continue

        # Smooth the projection so argmax picks the edge centre, not noise.
        kernel_size = max(3, w // 100)
        if kernel_size % 2 == 0:
            kernel_size += 1
        smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
        x_pos = float(np.argmax(smoothed))
        y_center = (y_start + y_end) / 2.0
        edge_positions.append((y_center, x_pos))

    # Need enough strips with a detectable edge for a stable line fit.
    if len(edge_positions) < 8:
        return result

    ys = np.array([p[0] for p in edge_positions])
    xs = np.array([p[1] for p in edge_positions])

    # Reject outlier strips (e.g. where text was mistaken for the border).
    median_x = np.median(xs)
    std_x = max(np.std(xs), 1.0)
    mask = np.abs(xs - median_x) < 2 * std_x
    ys = ys[mask]
    xs = xs[mask]

    if len(ys) < 6:
        return result

    # Fit x = slope * y + c through the edge positions; the slope is the
    # horizontal drift of the edge per unit of height, i.e. the shear.
    straight_coeffs = np.polyfit(ys, xs, 1)
    slope = straight_coeffs[0]
    fitted = np.polyval(straight_coeffs, ys)
    residuals = xs - fitted
    rmse = float(np.sqrt(np.mean(residuals ** 2)))

    shear_degrees = math.degrees(math.atan(slope))

    # Confidence grows with the number of inlier strips (saturating at 15)
    # and shrinks with the fit error (RMSE in pixels).
    confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(float(confidence), 2)

    return result
+
+
def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear angle by maximising variance of horizontal text-line projections (Method B).

    Applies candidate shear transforms to a downscaled binary image and
    picks the angle whose horizontal projection profile has the highest
    variance (best-separated text lines), with a coarse sweep followed by
    a fine sweep around the coarse winner.
    """
    result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}

    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Half-resolution is enough for a variance comparison and is faster.
    small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA)
    sh, sw = small.shape

    def _sweep_variance(angles_list):
        # Score each candidate angle by the variance of the row-wise
        # projection after un-shearing by that angle.
        results = []
        for angle_deg in angles_list:
            if abs(angle_deg) < 0.001:
                # Effectively zero: reuse the image unsheared.
                rotated = small
            else:
                # Horizontal shear proportional to y, centred vertically
                # so the middle row stays fixed.
                shear_tan = math.tan(math.radians(angle_deg))
                M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]])
                rotated = cv2.warpAffine(small, M, (sw, sh),
                                         flags=cv2.INTER_NEAREST,
                                         borderMode=cv2.BORDER_CONSTANT)
            profile = np.sum(rotated, axis=1).astype(float)
            results.append((angle_deg, float(np.var(profile))))
        return results

    # Coarse sweep: -3.0 .. +3.0 deg in 0.5 deg steps.
    coarse_angles = [a * 0.5 for a in range(-6, 7)]
    coarse_results = _sweep_variance(coarse_angles)
    coarse_best = max(coarse_results, key=lambda x: x[1])

    # Fine sweep: +/-0.5 deg around the coarse winner in 0.05 deg steps.
    fine_center = coarse_best[0]
    fine_angles = [fine_center + a * 0.05 for a in range(-10, 11)]
    fine_results = _sweep_variance(fine_angles)
    fine_best = max(fine_results, key=lambda x: x[1])

    best_angle = fine_best[0]
    best_variance = fine_best[1]
    variances = coarse_results + fine_results

    # Confidence: how far the winning variance stands above the mean of
    # all sampled variances (scaled by 0.6, clipped to [0, 1]).
    all_mean = sum(v for _, v in variances) / len(variances)
    if all_mean > 0 and best_variance > all_mean:
        confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6)
    else:
        confidence = 0.0

    result["shear_degrees"] = round(best_angle, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
+
+
def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear using Hough transform on printed table / ruled lines (Method C).

    Finds near-horizontal line segments, takes their length-weighted
    median angle, and reports its negation as the shear estimate.
    """
    result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}

    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edge_map = cv2.Canny(gray, 50, 150, apertureSize=3)

    min_len = int(w * 0.15)
    segments = cv2.HoughLinesP(
        edge_map, rho=1, theta=np.pi / 360,
        threshold=int(w * 0.08),
        minLineLength=min_len,
        maxLineGap=20,
    )

    if segments is None or len(segments) < 3:
        return result

    # Collect (angle, length) for near-horizontal, non-vertical segments.
    candidates: List[Tuple[float, float]] = []
    for seg in segments:
        x1, y1, x2, y2 = seg[0]
        if x1 == x2:
            continue
        seg_angle = float(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
        if abs(seg_angle) <= 5.0:
            seg_length = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2))
            candidates.append((seg_angle, seg_length))

    if len(candidates) < 3:
        return result

    # Length-weighted median angle: long ruled lines dominate short
    # noise segments.
    angle_values = np.array([a for a, _ in candidates])
    length_weights = np.array([ln for _, ln in candidates])
    order = np.argsort(angle_values)
    sorted_angles = angle_values[order]
    cum_weights = np.cumsum(length_weights[order])
    half_idx = int(np.searchsorted(cum_weights, cum_weights[-1] / 2.0))
    median_angle = float(sorted_angles[min(half_idx, len(sorted_angles) - 1)])

    # Confidence: share of segments within 1 deg of the median, capped
    # at 0.85 (this method is never fully trusted on its own).
    agreeing = sum(1 for a, _ in candidates if abs(a - median_angle) < 1.0)
    confidence = min(1.0, agreeing / max(len(candidates), 1)) * 0.85

    result["shear_degrees"] = round(-median_angle, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
+
+
def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear by measuring text-line straightness (Method D).

    Runs a quick Tesseract pass on a half-resolution copy, clusters words
    into vertical columns of similar left edges, fits x = f(y) through
    each sufficiently tall column, and takes the median slope as the
    shear estimate.
    """
    result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0}

    h, w = img.shape[:2]
    # Half resolution keeps the Tesseract pass fast; all coordinates
    # below stay in the downscaled space (slopes are scale-invariant).
    scale = 0.5
    small = cv2.resize(img, (int(w * scale), int(h * scale)),
                       interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    pil_img = Image.fromarray(gray)

    try:
        data = pytesseract.image_to_data(
            pil_img, lang='eng+deu', config='--psm 11 --oem 3',
            output_type=pytesseract.Output.DICT,
        )
    except Exception:
        # Best-effort method: on OCR failure report zero confidence.
        return result

    # Collect (left_x, centre_y, width) of confident multi-char words.
    words = []
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        conf = int(data['conf'][i])
        if not text or conf < 20 or len(text) < 2:
            continue
        left_x = float(data['left'][i])
        cy = data['top'][i] + data['height'][i] / 2.0
        word_w = float(data['width'][i])
        words.append((left_x, cy, word_w))

    if len(words) < 15:
        return result

    # Column tolerance: 40% of the average word width, at least 8 px.
    avg_w = sum(ww for _, _, ww in words) / len(words)
    x_tol = max(avg_w * 0.4, 8)

    # Greedy clustering of words (sorted by left edge) into columns; the
    # running column x is an exponential moving average so the column can
    # drift slightly with the shear.
    words_by_x = sorted(words, key=lambda w: w[0])
    columns: List[List[Tuple[float, float]]] = []
    cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
    cur_x = words_by_x[0][0]

    for lx, cy, _ in words_by_x[1:]:
        if abs(lx - cur_x) <= x_tol:
            cur_col.append((lx, cy))
            cur_x = cur_x * 0.8 + lx * 0.2
        else:
            # Keep only columns with at least 5 members.
            if len(cur_col) >= 5:
                columns.append(cur_col)
            cur_col = [(lx, cy)]
            cur_x = lx
    if len(cur_col) >= 5:
        columns.append(cur_col)

    if len(columns) < 2:
        return result

    # Per-column drift: slope of x over y, for columns that span at
    # least 30% of the (downscaled) page height.
    drifts = []
    for col in columns:
        ys = np.array([p[1] for p in col])
        xs = np.array([p[0] for p in col])
        y_range = ys.max() - ys.min()
        if y_range < h * scale * 0.3:
            continue
        coeffs = np.polyfit(ys, xs, 1)
        drifts.append(coeffs[0])

    if len(drifts) < 2:
        return result

    median_drift = float(np.median(drifts))
    shear_degrees = math.degrees(math.atan(median_drift))

    # Confidence: half from column count (saturating at 4), half from
    # how consistent the drifts are across columns.
    drift_std = float(np.std(drifts))
    consistency = max(0.0, 1.0 - drift_std * 50)
    count_factor = min(1.0, len(drifts) / 4.0)
    confidence = count_factor * 0.5 + consistency * 0.5

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
                "shear=%.3f\u00b0, conf=%.2f",
                len(columns), len(drifts), median_drift,
                shear_degrees, confidence)
    return result
+
+
+# =============================================================================
+# Quality Check and Shear Application
+# =============================================================================
+
def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool:
    """Check whether the dewarp correction actually improved alignment.

    Better-aligned horizontal text lines concentrate ink into fewer rows,
    raising the variance of the horizontal projection profile.  The
    correction is accepted only when that variance strictly increases.
    """
    def _row_projection_variance(image: np.ndarray) -> float:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255,
                                  cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        # Half resolution is plenty for a variance comparison and faster.
        half = cv2.resize(binary, (binary.shape[1] // 2, binary.shape[0] // 2),
                          interpolation=cv2.INTER_AREA)
        row_sums = np.sum(half, axis=1).astype(float)
        return float(np.var(row_sums))

    return _row_projection_variance(corrected) > _row_projection_variance(original)
+
+
def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Apply a vertical shear correction to an image.

    Shifts each row horizontally in proportion to its distance from the
    vertical centre (x' = x + tan(angle) * (y - h/2)), so the centre row
    stays fixed while the shear is removed.
    """
    height, width = img.shape[:2]
    tangent = math.tan(math.radians(shear_degrees))

    shear_matrix = np.float32([
        [1, tangent, -height / 2.0 * tangent],
        [0, 1, 0],
    ])

    return cv2.warpAffine(img, shear_matrix, (width, height),
                          flags=cv2.INTER_LINEAR,
                          borderMode=cv2.BORDER_REPLICATE)
+
+
+# =============================================================================
+# Ensemble Shear Combination
+# =============================================================================
+
+def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
+ """Combine multiple shear detections into a single weighted estimate (v2)."""
+ _MIN_CONF = 0.35
+ _METHOD_WEIGHT_BOOST = {"text_lines": 1.5}
+
+ accepted = []
+ for d in detections:
+ if d["confidence"] < _MIN_CONF:
+ continue
+ boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0)
+ effective_conf = d["confidence"] * boost
+ accepted.append((d["shear_degrees"], effective_conf, d["method"]))
+
+ if not accepted:
+ return 0.0, 0.0, "none"
+
+ if len(accepted) == 1:
+ deg, conf, method = accepted[0]
+ return deg, min(conf, 1.0), method
+
+ total_w = sum(c for _, c, _ in accepted)
+ w_mean = sum(d * c for d, c, _ in accepted) / total_w
+
+ filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
+ if not filtered:
+ filtered = accepted
+
+ total_w2 = sum(c for _, c, _ in filtered)
+ final_deg = sum(d * c for d, c, _ in filtered) / total_w2
+
+ avg_conf = total_w2 / len(filtered)
+ spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
+ agreement_bonus = 0.15 if spread < 0.5 else 0.0
+ ensemble_conf = min(1.0, avg_conf + agreement_bonus)
+
+ methods_str = "+".join(m for _, _, m in filtered)
+ return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str
+
+
+# =============================================================================
+# Main Dewarp Function
+# =============================================================================
+
def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Correct vertical shear after deskew (v2 with quality gate).

    Methods (all run in ~150ms total):
      A. _detect_shear_angle() -- vertical edge profile (~50ms)
      B. _detect_shear_by_projection() -- horizontal text-line variance (~30ms)
      C. _detect_shear_by_hough() -- Hough lines on table borders (~20ms)
      D. _detect_shear_by_text_lines() -- text-line straightness (~50ms)

    The ensemble estimate is only applied when the angle is at least
    0.08 deg and the confidence at least 0.4; corrections of 0.5 deg or
    more must additionally pass _dewarp_quality_check().

    Args:
        img: BGR image (already deskewed).
        use_ensemble: If False, fall back to single-method behaviour (method A only).

    Returns:
        Tuple of (corrected_image, dewarp_info).  dewarp_info always
        carries the individual "detections", even when no correction was
        applied.
    """
    no_correction = {
        "method": "none",
        "shear_degrees": 0.0,
        "confidence": 0.0,
        "detections": [],
    }

    if not CV2_AVAILABLE:
        return img, no_correction

    t0 = time.time()

    if use_ensemble:
        # Run all four detectors and combine them.
        det_a = _detect_shear_angle(img)
        det_b = _detect_shear_by_projection(img)
        det_c = _detect_shear_by_hough(img)
        det_d = _detect_shear_by_text_lines(img)
        detections = [det_a, det_b, det_c, det_d]
        shear_deg, confidence, method = _ensemble_shear(detections)
    else:
        # Legacy single-method mode: vertical-edge detector only.
        det_a = _detect_shear_angle(img)
        detections = [det_a]
        shear_deg = det_a["shear_degrees"]
        confidence = det_a["confidence"]
        method = det_a["method"]

    duration = time.time() - t0

    # One-line summary of the ensemble plus each detector's raw output
    # (indexing is guarded for the single-method case).
    logger.info(
        "dewarp: ensemble shear=%.3f\u00b0 conf=%.2f method=%s (%.2fs) | "
        "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f",
        shear_deg, confidence, method, duration,
        detections[0]["shear_degrees"], detections[0]["confidence"],
        detections[1]["shear_degrees"] if len(detections) > 1 else 0.0,
        detections[1]["confidence"] if len(detections) > 1 else 0.0,
        detections[2]["shear_degrees"] if len(detections) > 2 else 0.0,
        detections[2]["confidence"] if len(detections) > 2 else 0.0,
        detections[3]["shear_degrees"] if len(detections) > 3 else 0.0,
        detections[3]["confidence"] if len(detections) > 3 else 0.0,
    )

    # Always include individual detections in the returned info, even
    # when no correction ends up being applied.
    _all_detections = [
        {"method": d["method"], "shear_degrees": d["shear_degrees"],
         "confidence": d["confidence"]}
        for d in detections
    ]

    # Thresholds: shear below 0.08 deg is irrelevant; confidence below
    # 0.4 is too weak to risk a correction.
    if abs(shear_deg) < 0.08 or confidence < 0.4:
        no_correction["detections"] = _all_detections
        return img, no_correction

    # Negate the detected shear to straighten the image.
    corrected = _apply_shear(img, -shear_deg)

    # Quality gate for large corrections (>= 0.5 deg) only: small
    # corrections change the projection variance too little to judge.
    if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected):
        logger.info("dewarp: quality gate REJECTED correction (%.3f\u00b0) -- "
                    "projection variance did not improve", shear_deg)
        no_correction["detections"] = _all_detections
        return img, no_correction

    info = {
        "method": method,
        "shear_degrees": shear_deg,
        "confidence": confidence,
        "detections": _all_detections,
    }

    return corrected, info
+
+
def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Apply shear correction with a manual angle.

    Args:
        img: BGR image (deskewed, before dewarp).
        shear_degrees: Shear angle in degrees to correct.

    Returns:
        Corrected image, or the input unchanged for negligible angles.
    """
    # Below ~0.001 deg the shear matrix is effectively the identity.
    if not abs(shear_degrees) < 0.001:
        return _apply_shear(img, -shear_degrees)
    return img
diff --git a/klausur-service/backend/cv_review.py b/klausur-service/backend/cv_review.py
index 5da85c2..217e463 100644
--- a/klausur-service/backend/cv_review.py
+++ b/klausur-service/backend/cv_review.py
@@ -1,1248 +1,46 @@
"""
Multi-pass OCR, line matching, LLM/spell review, and pipeline orchestration.
+Re-export facade -- all logic lives in the sub-modules:
+
+ cv_review_pipeline Stages 6-8: OCR, line alignment, orchestrator
+ cv_review_spell Rule-based spell-checker OCR correction
+ cv_review_llm LLM-based OCR correction, prompt building, streaming
+
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
-import json
-import logging
-import os
-import re
-import time
-from typing import Any, Dict, List, Optional, Tuple
-
-import numpy as np
-
-from cv_vocab_types import (
- CV_PIPELINE_AVAILABLE,
- PageRegion,
- PipelineResult,
- VocabRow,
-)
-from cv_preprocessing import (
- deskew_image,
- dewarp_image,
- render_image_high_res,
- render_pdf_high_res,
-)
-from cv_layout import (
- analyze_layout,
- create_layout_image,
- create_ocr_image,
-)
-from cv_ocr_engines import (
- _fix_character_confusion,
- _group_words_into_lines,
+# Re-export everything for backward compatibility
+from cv_review_pipeline import ( # noqa: F401
+ ocr_region,
+ run_multi_pass_ocr,
+ match_lines_to_vocab,
+ llm_post_correct,
+ run_cv_pipeline,
)
-logger = logging.getLogger(__name__)
-
-try:
- import cv2
-except ImportError:
- cv2 = None # type: ignore[assignment]
-
-try:
- import pytesseract
- from PIL import Image
-except ImportError:
- pytesseract = None # type: ignore[assignment]
- Image = None # type: ignore[assignment,misc]
-
-
-# =============================================================================
-# Stage 6: Multi-Pass OCR
-# =============================================================================
-
-def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
- psm: int, fallback_psm: Optional[int] = None,
- min_confidence: float = 40.0) -> List[Dict[str, Any]]:
- """Run Tesseract OCR on a specific region with given PSM.
-
- Args:
- ocr_img: Binarized full-page image.
- region: Region to crop and OCR.
- lang: Tesseract language string.
- psm: Page Segmentation Mode.
- fallback_psm: If confidence too low, retry with this PSM per line.
- min_confidence: Minimum average confidence before fallback.
-
- Returns:
- List of word dicts with text, position, confidence.
- """
- # Crop region
- crop = ocr_img[region.y:region.y + region.height,
- region.x:region.x + region.width]
-
- if crop.size == 0:
- return []
-
- # Convert to PIL for pytesseract
- pil_img = Image.fromarray(crop)
-
- # Run Tesseract with specified PSM
- config = f'--psm {psm} --oem 3'
- try:
- data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
- output_type=pytesseract.Output.DICT)
- except Exception as e:
- logger.warning(f"Tesseract failed for region {region.type}: {e}")
- return []
-
- words = []
- for i in range(len(data['text'])):
- text = data['text'][i].strip()
- conf = int(data['conf'][i])
- if not text or conf < 10:
- continue
- words.append({
- 'text': text,
- 'left': data['left'][i] + region.x, # Absolute coords
- 'top': data['top'][i] + region.y,
- 'width': data['width'][i],
- 'height': data['height'][i],
- 'conf': conf,
- 'region_type': region.type,
- })
-
- # Check average confidence
- if words and fallback_psm is not None:
- avg_conf = sum(w['conf'] for w in words) / len(words)
- if avg_conf < min_confidence:
- logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, "
- f"trying fallback PSM {fallback_psm}")
- words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm)
-
- return words
-
-
-def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
- lang: str, psm: int) -> List[Dict[str, Any]]:
- """OCR a region line by line (fallback for low-confidence regions).
-
- Splits the region into horizontal strips based on text density,
- then OCRs each strip individually with the given PSM.
- """
- crop = ocr_img[region.y:region.y + region.height,
- region.x:region.x + region.width]
-
- if crop.size == 0:
- return []
-
- # Find text lines via horizontal projection
- inv = cv2.bitwise_not(crop)
- h_proj = np.sum(inv, axis=1)
- threshold = np.max(h_proj) * 0.05 if np.max(h_proj) > 0 else 0
-
- # Find line boundaries
- lines = []
- in_text = False
- line_start = 0
- for y in range(len(h_proj)):
- if h_proj[y] > threshold and not in_text:
- line_start = y
- in_text = True
- elif h_proj[y] <= threshold and in_text:
- if y - line_start > 5: # Minimum line height
- lines.append((line_start, y))
- in_text = False
- if in_text and len(h_proj) - line_start > 5:
- lines.append((line_start, len(h_proj)))
-
- all_words = []
- config = f'--psm {psm} --oem 3'
-
- for line_y_start, line_y_end in lines:
- # Add small padding
- pad = 3
- y1 = max(0, line_y_start - pad)
- y2 = min(crop.shape[0], line_y_end + pad)
- line_crop = crop[y1:y2, :]
-
- if line_crop.size == 0:
- continue
-
- pil_img = Image.fromarray(line_crop)
- try:
- data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
- output_type=pytesseract.Output.DICT)
- except Exception:
- continue
-
- for i in range(len(data['text'])):
- text = data['text'][i].strip()
- conf = int(data['conf'][i])
- if not text or conf < 10:
- continue
- all_words.append({
- 'text': text,
- 'left': data['left'][i] + region.x,
- 'top': data['top'][i] + region.y + y1,
- 'width': data['width'][i],
- 'height': data['height'][i],
- 'conf': conf,
- 'region_type': region.type,
- })
-
- return all_words
-
-
-def run_multi_pass_ocr(ocr_img: np.ndarray,
- regions: List[PageRegion],
- lang: str = "eng+deu") -> Dict[str, List[Dict]]:
- """Run OCR on each detected region with optimized settings.
-
- Args:
- ocr_img: Binarized full-page image.
- regions: Detected page regions.
- lang: Default language.
-
- Returns:
- Dict mapping region type to list of word dicts.
- """
- results: Dict[str, List[Dict]] = {}
-
- _ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
- for region in regions:
- if region.type in _ocr_skip:
- continue # Skip non-content regions
-
- if region.type == 'column_en':
- words = ocr_region(ocr_img, region, lang='eng', psm=4)
- elif region.type == 'column_de':
- words = ocr_region(ocr_img, region, lang='deu', psm=4)
- elif region.type == 'column_example':
- words = ocr_region(ocr_img, region, lang=lang, psm=6,
- fallback_psm=7, min_confidence=40.0)
- else:
- words = ocr_region(ocr_img, region, lang=lang, psm=6)
-
- results[region.type] = words
- logger.info(f"OCR {region.type}: {len(words)} words")
-
- return results
-
-
-# =============================================================================
-# Stage 7: Line Alignment → Vocabulary Entries
-# =============================================================================
-
-def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
- regions: List[PageRegion],
- y_tolerance_px: int = 25) -> List[VocabRow]:
- """Align OCR results from different columns into vocabulary rows.
-
- Uses Y-coordinate matching to pair English words, German translations,
- and example sentences that appear on the same line.
-
- Args:
- ocr_results: Dict mapping region type to word lists.
- regions: Detected regions (for reference).
- y_tolerance_px: Max Y-distance to consider words on the same row.
-
- Returns:
- List of VocabRow objects.
- """
- # If no vocabulary columns detected (e.g. plain text page), return empty
- if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
- logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
- return []
-
- # Group words into lines per column
- en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
- de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
- ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)
-
- def line_y_center(line: List[Dict]) -> float:
- return sum(w['top'] + w['height'] / 2 for w in line) / len(line)
-
- def line_text(line: List[Dict]) -> str:
- return ' '.join(w['text'] for w in line)
-
- def line_confidence(line: List[Dict]) -> float:
- return sum(w['conf'] for w in line) / len(line) if line else 0
-
- # Build EN entries as the primary reference
- vocab_rows: List[VocabRow] = []
-
- for en_line in en_lines:
- en_y = line_y_center(en_line)
- en_text = line_text(en_line)
- en_conf = line_confidence(en_line)
-
- # Skip very short or likely header content
- if len(en_text.strip()) < 2:
- continue
-
- # Find matching DE line
- de_text = ""
- de_conf = 0.0
- best_de_dist = float('inf')
- best_de_idx = -1
- for idx, de_line in enumerate(de_lines):
- dist = abs(line_y_center(de_line) - en_y)
- if dist < y_tolerance_px and dist < best_de_dist:
- best_de_dist = dist
- best_de_idx = idx
-
- if best_de_idx >= 0:
- de_text = line_text(de_lines[best_de_idx])
- de_conf = line_confidence(de_lines[best_de_idx])
-
- # Find matching example line
- ex_text = ""
- ex_conf = 0.0
- best_ex_dist = float('inf')
- best_ex_idx = -1
- for idx, ex_line in enumerate(ex_lines):
- dist = abs(line_y_center(ex_line) - en_y)
- if dist < y_tolerance_px and dist < best_ex_dist:
- best_ex_dist = dist
- best_ex_idx = idx
-
- if best_ex_idx >= 0:
- ex_text = line_text(ex_lines[best_ex_idx])
- ex_conf = line_confidence(ex_lines[best_ex_idx])
-
- avg_conf = en_conf
- conf_count = 1
- if de_conf > 0:
- avg_conf += de_conf
- conf_count += 1
- if ex_conf > 0:
- avg_conf += ex_conf
- conf_count += 1
-
- vocab_rows.append(VocabRow(
- english=en_text.strip(),
- german=de_text.strip(),
- example=ex_text.strip(),
- confidence=avg_conf / conf_count,
- y_position=int(en_y),
- ))
-
- # Handle multi-line wrapping in example column:
- # If an example line has no matching EN/DE, append to previous entry
- matched_ex_ys = set()
- for row in vocab_rows:
- if row.example:
- matched_ex_ys.add(row.y_position)
-
- for ex_line in ex_lines:
- ex_y = line_y_center(ex_line)
- # Check if already matched
- already_matched = any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys)
- if already_matched:
- continue
-
- # Find nearest previous vocab row
- best_row = None
- best_dist = float('inf')
- for row in vocab_rows:
- dist = ex_y - row.y_position
- if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
- best_dist = dist
- best_row = row
-
- if best_row:
- continuation = line_text(ex_line).strip()
- if continuation:
- best_row.example = (best_row.example + " " + continuation).strip()
-
- # Sort by Y position
- vocab_rows.sort(key=lambda r: r.y_position)
-
- return vocab_rows
-
-
-# =============================================================================
-# Stage 8: Optional LLM Post-Correction
-# =============================================================================
-
-async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
- confidence_threshold: float = 50.0,
- enabled: bool = False) -> List[VocabRow]:
- """Optionally send low-confidence regions to Qwen-VL for correction.
-
- Default: disabled. Enable per parameter.
-
- Args:
- img: Original BGR image.
- vocab_rows: Current vocabulary rows.
- confidence_threshold: Rows below this get LLM correction.
- enabled: Whether to actually run LLM correction.
-
- Returns:
- Corrected vocabulary rows.
- """
- if not enabled:
- return vocab_rows
-
- # TODO: Implement Qwen-VL correction for low-confidence entries
- # For each row with confidence < threshold:
- # 1. Crop the relevant region from img
- # 2. Send crop + OCR text to Qwen-VL
- # 3. Replace text if LLM provides a confident correction
- logger.info(f"LLM post-correction skipped (not yet implemented)")
- return vocab_rows
-
-
-# =============================================================================
-# Orchestrator
-# =============================================================================
-
-async def run_cv_pipeline(
- pdf_data: Optional[bytes] = None,
- image_data: Optional[bytes] = None,
- page_number: int = 0,
- zoom: float = 3.0,
- enable_dewarp: bool = True,
- enable_llm_correction: bool = False,
- lang: str = "eng+deu",
-) -> PipelineResult:
- """Run the complete CV document reconstruction pipeline.
-
- Args:
- pdf_data: Raw PDF bytes (mutually exclusive with image_data).
- image_data: Raw image bytes (mutually exclusive with pdf_data).
- page_number: 0-indexed page number (for PDF).
- zoom: PDF rendering zoom factor.
- enable_dewarp: Whether to run dewarp stage.
- enable_llm_correction: Whether to run LLM post-correction.
- lang: Tesseract language string.
-
- Returns:
- PipelineResult with vocabulary and timing info.
- """
- if not CV_PIPELINE_AVAILABLE:
- return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")
-
- result = PipelineResult()
- total_start = time.time()
-
- try:
- # Stage 1: Render
- t = time.time()
- if pdf_data:
- img = render_pdf_high_res(pdf_data, page_number, zoom)
- elif image_data:
- img = render_image_high_res(image_data)
- else:
- return PipelineResult(error="No input data (pdf_data or image_data required)")
- result.stages['render'] = round(time.time() - t, 2)
- result.image_width = img.shape[1]
- result.image_height = img.shape[0]
- logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")
-
- # Stage 2: Deskew
- t = time.time()
- img, angle = deskew_image(img)
- result.stages['deskew'] = round(time.time() - t, 2)
- logger.info(f"Stage 2 (deskew): {angle:.2f}° in {result.stages['deskew']}s")
-
- # Stage 3: Dewarp
- if enable_dewarp:
- t = time.time()
- img, _dewarp_info = dewarp_image(img)
- result.stages['dewarp'] = round(time.time() - t, 2)
-
- # Stage 4: Dual image preparation
- t = time.time()
- ocr_img = create_ocr_image(img)
- layout_img = create_layout_image(img)
- result.stages['image_prep'] = round(time.time() - t, 2)
-
- # Stage 5: Layout analysis
- t = time.time()
- regions = analyze_layout(layout_img, ocr_img)
- result.stages['layout'] = round(time.time() - t, 2)
- result.columns_detected = len([r for r in regions if r.type.startswith('column')])
- logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")
-
- # Stage 6: Multi-pass OCR
- t = time.time()
- ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
- result.stages['ocr'] = round(time.time() - t, 2)
- total_words = sum(len(w) for w in ocr_results.values())
- result.word_count = total_words
- logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")
-
- # Stage 7: Line alignment
- t = time.time()
- vocab_rows = match_lines_to_vocab(ocr_results, regions)
- result.stages['alignment'] = round(time.time() - t, 2)
-
- # Stage 8: Optional LLM correction
- if enable_llm_correction:
- t = time.time()
- vocab_rows = await llm_post_correct(img, vocab_rows)
- result.stages['llm_correction'] = round(time.time() - t, 2)
-
- # Convert to output format
- result.vocabulary = [
- {
- "english": row.english,
- "german": row.german,
- "example": row.example,
- "confidence": round(row.confidence, 1),
- }
- for row in vocab_rows
- if row.english or row.german # Skip empty rows
- ]
-
- result.duration_seconds = round(time.time() - total_start, 2)
- logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")
-
- except Exception as e:
- logger.error(f"CV Pipeline error: {e}")
- import traceback
- logger.debug(traceback.format_exc())
- result.error = str(e)
- result.duration_seconds = round(time.time() - total_start, 2)
-
- return result
-
-
-# ---------------------------------------------------------------------------
-# LLM-based OCR Correction (Step 6)
-# ---------------------------------------------------------------------------
-
-import httpx
-import os
-import json as _json
-import re as _re
-
-_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
-OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
-_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
-logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)
-
-# Regex: entry contains IPA phonetic brackets like "dance [dɑːns]"
-_HAS_PHONETIC_RE = _re.compile(r'\[.*?[ˈˌːʃʒθðŋɑɒɔəɜɪʊʌæ].*?\]')
-
-# Regex: digit adjacent to a letter — the hallmark of OCR digit↔letter confusion.
-# Matches digits 0,1,5,6,8 (common OCR confusions: 0→O, 1→l/I, 5→S, 6→G, 8→B)
-# when they appear inside or next to a word character.
-_OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568](?=[A-Za-zÄÖÜäöüß])')
-
-
-def _entry_needs_review(entry: Dict) -> bool:
- """Check if an entry should be sent to the LLM for review.
-
- Sends all non-empty entries that don't have IPA phonetic transcriptions.
- The LLM prompt and _is_spurious_change() guard against unwanted changes.
- """
- en = entry.get("english", "") or ""
- de = entry.get("german", "") or ""
-
- # Skip completely empty entries
- if not en.strip() and not de.strip():
- return False
- # Skip entries with IPA/phonetic brackets — dictionary-corrected, LLM must not touch them
- if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de):
- return False
- return True
-
-
-def _build_llm_prompt(table_lines: List[Dict]) -> str:
- """Build the LLM correction prompt for a batch of entries."""
- return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).
-
-DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.
-
-NUR diese Korrekturen sind erlaubt:
-- Ziffer 8 statt B: "8en" → "Ben", "8uch" → "Buch", "8all" → "Ball"
-- Ziffer 0 statt O oder o: "L0ndon" → "London", "0ld" → "Old"
-- Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin"
-- Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See"
-- Ziffer 6 statt G oder g: "6eld" → "Geld"
-- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help"
-
-ABSOLUT VERBOTEN — aendere NIEMALS:
-- Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst
-- Uebersetzungen — du uebersetzt NICHTS, weder EN→DE noch DE→EN
-- Korrekte englische Woerter (en-Spalte) — auch wenn du eine Bedeutung kennst
-- Korrekte deutsche Woerter (de-Spalte) — auch wenn du sie anders sagen wuerdest
-- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
-- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
-- Lautschrift in eckigen Klammern [...] — diese NIEMALS beruehren
-- Beispielsaetze in der ex-Spalte — NIEMALS aendern
-
-Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.
-
-Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
-Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).
-
-/no_think
-
-Eingabe:
-{_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
-
-
-def _is_spurious_change(old_val: str, new_val: str) -> bool:
- """Detect LLM changes that are likely wrong and should be discarded.
-
- Only digit↔letter substitutions (0→O, 1→l, 5→S, 6→G, 8→B) are
- legitimate OCR corrections. Everything else is rejected.
-
- Filters out:
- - Case-only changes
- - Changes that don't contain any digit→letter fix
- - Completely different words (LLM translating or hallucinating)
- - Additions or removals of whole words (count changed)
- """
- if not old_val or not new_val:
- return False
-
- # Case-only change — never a real OCR error
- if old_val.lower() == new_val.lower():
- return True
-
- # If the word count changed significantly, the LLM rewrote rather than fixed
- old_words = old_val.split()
- new_words = new_val.split()
- if abs(len(old_words) - len(new_words)) > 1:
- return True
-
- # Core rule: a legitimate correction replaces a digit with the corresponding
- # letter. If the change doesn't include such a substitution, reject it.
- # Build a set of (old_char, new_char) pairs that differ between old and new.
- # Use character-level diff heuristic: if lengths are close, zip and compare.
- # Map of characters that OCR commonly misreads → set of correct replacements
- _OCR_CHAR_MAP = {
- # Digits mistaken for letters
- '0': set('oOgG'),
- '1': set('lLiI'),
- '5': set('sS'),
- '6': set('gG'),
- '8': set('bB'),
- # Non-letter symbols mistaken for letters
- '|': set('lLiI1'), # pipe → lowercase l, capital I, or digit 1
- 'l': set('iI|1'), # lowercase l → capital I (and reverse)
- }
- has_valid_fix = False
- if len(old_val) == len(new_val):
- for oc, nc in zip(old_val, new_val):
- if oc != nc:
- if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
- has_valid_fix = True
- elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
- # Reverse check (e.g. l→I where new is the "correct" char)
- has_valid_fix = True
- else:
- # Length changed by 1: accept if old had a suspicious char sequence
- _OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]')
- if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
- has_valid_fix = True
-
- if not has_valid_fix:
- return True # Reject — looks like translation or hallucination
-
- return False
-
-
-def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
- """Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
- changes = []
- entries_out = []
- for i, orig in enumerate(originals):
- if i < len(corrected):
- c = corrected[i]
- entry = dict(orig)
- for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]:
- new_val = c.get(key, "").strip()
- old_val = (orig.get(field_name, "") or "").strip()
- if new_val and new_val != old_val:
- # Filter spurious LLM changes
- if _is_spurious_change(old_val, new_val):
- continue
- changes.append({
- "row_index": orig.get("row_index", i),
- "field": field_name,
- "old": old_val,
- "new": new_val,
- })
- entry[field_name] = new_val
- entry["llm_corrected"] = True
- entries_out.append(entry)
- else:
- entries_out.append(dict(orig))
- return changes, entries_out
-
-
-# ─── Spell-Checker OCR Review (Rule-Based, no LLM) ────────────────────────────
-
-REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell") # "spell" (default) | "llm"
-
-try:
- from spellchecker import SpellChecker as _SpellChecker
- _en_spell = _SpellChecker(language='en', distance=1)
- _de_spell = _SpellChecker(language='de', distance=1)
- _SPELL_AVAILABLE = True
- logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE)
-except ImportError:
- _SPELL_AVAILABLE = False
- logger.warning("pyspellchecker not installed — falling back to LLM review")
-
-# ─── Page-Ref Normalization ───────────────────────────────────────────────────
-# Normalizes OCR variants like "p-60", "p 61", "p60" → "p.60"
-_PAGE_REF_RE = _re.compile(r'\bp[\s\-]?(\d+)', _re.IGNORECASE)
-
-
-def _normalize_page_ref(text: str) -> str:
- """Normalize page references: 'p-60' / 'p 61' / 'p60' → 'p.60'."""
- if not text:
- return text
- return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text)
-
-
-# Suspicious OCR chars → ordered list of most-likely correct replacements
-_SPELL_SUBS: Dict[str, List[str]] = {
- '0': ['O', 'o'],
- '1': ['l', 'I'],
- '5': ['S', 's'],
- '6': ['G', 'g'],
- '8': ['B', 'b'],
- '|': ['I', 'l', '1'],
-}
-_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())
-
-# Tokenizer: word tokens (letters + pipe) alternating with separators
-_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)')
-
-
-def _spell_dict_knows(word: str) -> bool:
- """True if word is known in EN or DE dictionary."""
- if not _SPELL_AVAILABLE:
- return False
- w = word.lower()
- return bool(_en_spell.known([w])) or bool(_de_spell.known([w]))
-
-
-def _try_split_merged_word(token: str) -> Optional[str]:
- """Try to split a merged word like 'atmyschool' into 'at my school'.
-
- Uses dynamic programming to find the shortest sequence of dictionary
- words that covers the entire token. Only returns a result when the
- split produces at least 2 words and ALL parts are known dictionary words.
-
- Preserves original capitalisation by mapping back to the input string.
- """
- if not _SPELL_AVAILABLE or len(token) < 4:
- return None
-
- lower = token.lower()
- n = len(lower)
-
- # dp[i] = (word_lengths_list, score) for best split of lower[:i], or None
- # Score: (-word_count, sum_of_squared_lengths) — fewer words first,
- # then prefer longer words (e.g. "come on" over "com eon")
- dp: list = [None] * (n + 1)
- dp[0] = ([], 0)
-
- for i in range(1, n + 1):
- for j in range(max(0, i - 20), i):
- if dp[j] is None:
- continue
- candidate = lower[j:i]
- word_len = i - j
- if word_len == 1 and candidate not in ('a', 'i'):
- continue
- if _spell_dict_knows(candidate):
- prev_words, prev_sq = dp[j]
- new_words = prev_words + [word_len]
- new_sq = prev_sq + word_len * word_len
- new_key = (-len(new_words), new_sq)
- if dp[i] is None:
- dp[i] = (new_words, new_sq)
- else:
- old_key = (-len(dp[i][0]), dp[i][1])
- if new_key >= old_key:
- # >= so that later splits (longer first word) win ties
- dp[i] = (new_words, new_sq)
-
- if dp[n] is None or len(dp[n][0]) < 2:
- return None
-
- # Reconstruct with original casing
- result = []
- pos = 0
- for wlen in dp[n][0]:
- result.append(token[pos:pos + wlen])
- pos += wlen
-
- logger.debug("Split merged word: %r → %r", token, " ".join(result))
- return " ".join(result)
-
-
-def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
- """Return corrected form of token, or None if no fix needed/possible.
-
- *field* is 'english' or 'german' — used to pick the right dictionary
- for general spell correction (step 3 below).
- """
- has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)
-
- # 1. Already known word → no fix needed
- if _spell_dict_knows(token):
- return None
-
- # 2. Digit/pipe substitution (existing logic)
- if has_suspicious:
- # Standalone pipe → capital I
- if token == '|':
- return 'I'
- # Dictionary-backed single-char substitution
- for i, ch in enumerate(token):
- if ch not in _SPELL_SUBS:
- continue
- for replacement in _SPELL_SUBS[ch]:
- candidate = token[:i] + replacement + token[i + 1:]
- if _spell_dict_knows(candidate):
- return candidate
- # Structural rule: suspicious char at position 0 + rest is all lowercase letters
- first = token[0]
- if first in _SPELL_SUBS and len(token) >= 2:
- rest = token[1:]
- if rest.isalpha() and rest.islower():
- candidate = _SPELL_SUBS[first][0] + rest
- if not candidate[0].isdigit():
- return candidate
-
- # 3. OCR umlaut confusion: OCR often drops umlaut dots (ü→i, ä→a, ö→o, ü→u)
- # Try single-char umlaut substitutions and check against dictionary.
- if len(token) >= 3 and token.isalpha() and field == "german":
- _UMLAUT_SUBS = {'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
- 'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü'}
- for i, ch in enumerate(token):
- if ch in _UMLAUT_SUBS:
- candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:]
- if _spell_dict_knows(candidate):
- return candidate
-
- # 4. General spell correction for unknown words (no digits/pipes)
- # e.g. "beautful" → "beautiful"
- if not has_suspicious and len(token) >= 3 and token.isalpha():
- spell = _en_spell if field == "english" else _de_spell if field == "german" else None
- if spell is not None:
- correction = spell.correction(token.lower())
- if correction and correction != token.lower():
- # Preserve original capitalisation pattern
- if token[0].isupper():
- correction = correction[0].upper() + correction[1:]
- if _spell_dict_knows(correction):
- return correction
-
- # 5. Merged-word split: OCR often merges adjacent words when spacing
- # is too tight, e.g. "atmyschool" → "at my school"
- if len(token) >= 4 and token.isalpha():
- split = _try_split_merged_word(token)
- if split:
- return split
-
- return None
-
-
-def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
- """Apply OCR corrections to a text field. Returns (fixed_text, was_changed).
-
- *field* is 'english' or 'german' — forwarded to _spell_fix_token for
- dictionary selection.
- """
- if not text:
- return text, False
- has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
- # If no suspicious chars AND no alpha chars that could be misspelled, skip
- if not has_suspicious and not any(c.isalpha() for c in text):
- return text, False
- # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
- fixed = _re.sub(r'(? Dict:
- """Rule-based OCR correction: spell-checker + structural heuristics.
-
- Deterministic — never translates, never touches IPA, never hallucinates.
- Uses SmartSpellChecker for language-aware corrections with context-based
- disambiguation (a/I), multi-digit substitution, and cross-language guard.
- """
- t0 = time.time()
- changes: List[Dict] = []
- all_corrected: List[Dict] = []
-
- # Use SmartSpellChecker if available, fall back to legacy _spell_fix_field
- _smart = None
- try:
- from smart_spell import SmartSpellChecker
- _smart = SmartSpellChecker()
- logger.debug("spell_review: using SmartSpellChecker")
- except Exception:
- logger.debug("spell_review: SmartSpellChecker not available, using legacy")
-
- # Map field names → language codes for SmartSpellChecker
- _LANG_MAP = {"english": "en", "german": "de", "example": "auto"}
-
- for i, entry in enumerate(entries):
- e = dict(entry)
- # Page-ref normalization (always, regardless of review status)
- old_ref = (e.get("source_page") or "").strip()
- if old_ref:
- new_ref = _normalize_page_ref(old_ref)
- if new_ref != old_ref:
- changes.append({
- "row_index": e.get("row_index", i),
- "field": "source_page",
- "old": old_ref,
- "new": new_ref,
- })
- e["source_page"] = new_ref
- e["llm_corrected"] = True
- if not _entry_needs_review(e):
- all_corrected.append(e)
- continue
- for field_name in ("english", "german", "example"):
- old_val = (e.get(field_name) or "").strip()
- if not old_val:
- continue
-
- if _smart:
- # SmartSpellChecker path — language-aware, context-based
- lang_code = _LANG_MAP.get(field_name, "en")
- result = _smart.correct_text(old_val, lang=lang_code)
- new_val = result.corrected
- was_changed = result.changed
- else:
- # Legacy path
- lang = "german" if field_name in ("german", "example") else "english"
- new_val, was_changed = _spell_fix_field(old_val, field=lang)
-
- if was_changed and new_val != old_val:
- changes.append({
- "row_index": e.get("row_index", i),
- "field": field_name,
- "old": old_val,
- "new": new_val,
- })
- e[field_name] = new_val
- e["llm_corrected"] = True
- all_corrected.append(e)
- duration_ms = int((time.time() - t0) * 1000)
- model_name = "smart-spell-checker" if _smart else "spell-checker"
- return {
- "entries_original": entries,
- "entries_corrected": all_corrected,
- "changes": changes,
- "skipped_count": 0,
- "model_used": model_name,
- "duration_ms": duration_ms,
- }
-
-
-async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
- """Async generator yielding SSE-compatible events for spell-checker review."""
- total = len(entries)
- yield {
- "type": "meta",
- "total_entries": total,
- "to_review": total,
- "skipped": 0,
- "model": "spell-checker",
- "batch_size": batch_size,
- }
- result = spell_review_entries_sync(entries)
- changes = result["changes"]
- yield {
- "type": "batch",
- "batch_index": 0,
- "entries_reviewed": [e.get("row_index", i) for i, e in enumerate(entries)],
- "changes": changes,
- "duration_ms": result["duration_ms"],
- "progress": {"current": total, "total": total},
- }
- yield {
- "type": "complete",
- "changes": changes,
- "model_used": "spell-checker",
- "duration_ms": result["duration_ms"],
- "total_entries": total,
- "reviewed": total,
- "skipped": 0,
- "corrections_found": len(changes),
- "entries_corrected": result["entries_corrected"],
- }
-
-# ─── End Spell-Checker ────────────────────────────────────────────────────────
-
-
-async def llm_review_entries(
- entries: List[Dict],
- model: str = None,
-) -> Dict:
- """OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm)."""
- if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
- return spell_review_entries_sync(entries)
- if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
- logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
-
- model = model or OLLAMA_REVIEW_MODEL
-
- # Filter: only entries that need review
- reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)]
-
- if not reviewable:
- return {
- "entries_original": entries,
- "entries_corrected": [dict(e) for e in entries],
- "changes": [],
- "skipped_count": len(entries),
- "model_used": model,
- "duration_ms": 0,
- }
-
- review_entries = [e for _, e in reviewable]
- table_lines = [
- {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
- for e in review_entries
- ]
-
- logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
- len(review_entries), len(entries), model, len(entries) - len(reviewable))
- logger.debug("LLM review input: %s", _json.dumps(table_lines[:3], ensure_ascii=False))
-
- prompt = _build_llm_prompt(table_lines)
-
- t0 = time.time()
- async with httpx.AsyncClient(timeout=300.0) as client:
- resp = await client.post(
- f"{_OLLAMA_URL}/api/chat",
- json={
- "model": model,
- "messages": [{"role": "user", "content": prompt}],
- "stream": False,
- "think": False, # qwen3: disable chain-of-thought (Ollama >=0.6)
- "options": {"temperature": 0.1, "num_predict": 8192},
- },
- )
- resp.raise_for_status()
- content = resp.json().get("message", {}).get("content", "")
- duration_ms = int((time.time() - t0) * 1000)
-
- logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
- logger.debug("LLM review raw response (first 500): %.500s", content)
-
- corrected = _parse_llm_json_array(content)
- logger.info("LLM review: parsed %d corrected entries, applying diff...", len(corrected))
- changes, corrected_entries = _diff_batch(review_entries, corrected)
-
- # Merge corrected entries back into the full list
- all_corrected = [dict(e) for e in entries]
- for batch_idx, (orig_idx, _) in enumerate(reviewable):
- if batch_idx < len(corrected_entries):
- all_corrected[orig_idx] = corrected_entries[batch_idx]
-
- return {
- "entries_original": entries,
- "entries_corrected": all_corrected,
- "changes": changes,
- "skipped_count": len(entries) - len(reviewable),
- "model_used": model,
- "duration_ms": duration_ms,
- }
-
-
-async def llm_review_entries_streaming(
- entries: List[Dict],
- model: str = None,
- batch_size: int = _REVIEW_BATCH_SIZE,
-):
- """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.
-
- Phase 0 (always): Run _fix_character_confusion and emit any changes so they are
- visible in the UI — this is the only place the fix now runs (removed from Step 1
- of build_vocab_pipeline_streaming).
- """
- # --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) ---
- _CONF_FIELDS = ('english', 'german', 'example')
- originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
- _fix_character_confusion(entries) # modifies in-place, returns same list
- char_changes = [
- {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
- for i in range(len(entries))
- for f in _CONF_FIELDS
- if originals[i][f] != entries[i].get(f, '')
- ]
-
- if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
- # Inject char_changes as a batch right after the meta event from the spell checker
- _meta_sent = False
- async for event in spell_review_entries_streaming(entries, batch_size):
- yield event
- if not _meta_sent and event.get('type') == 'meta' and char_changes:
- _meta_sent = True
- yield {
- 'type': 'batch',
- 'changes': char_changes,
- 'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
- 'progress': {'current': 0, 'total': len(entries)},
- }
- return
-
- if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
- logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
-
- # LLM path: emit char_changes first (before meta) so they appear in the UI
- if char_changes:
- yield {
- 'type': 'batch',
- 'changes': char_changes,
- 'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
- 'progress': {'current': 0, 'total': len(entries)},
- }
-
- model = model or OLLAMA_REVIEW_MODEL
-
- # Separate reviewable from skipped entries
- reviewable = []
- skipped_indices = []
- for i, e in enumerate(entries):
- if _entry_needs_review(e):
- reviewable.append((i, e))
- else:
- skipped_indices.append(i)
-
- total_to_review = len(reviewable)
-
- # meta event
- yield {
- "type": "meta",
- "total_entries": len(entries),
- "to_review": total_to_review,
- "skipped": len(skipped_indices),
- "model": model,
- "batch_size": batch_size,
- }
-
- all_changes = []
- all_corrected = [dict(e) for e in entries]
- total_duration_ms = 0
- reviewed_count = 0
-
- # Process in batches
- for batch_start in range(0, total_to_review, batch_size):
- batch_items = reviewable[batch_start:batch_start + batch_size]
- batch_entries = [e for _, e in batch_items]
-
- table_lines = [
- {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
- for e in batch_entries
- ]
-
- prompt = _build_llm_prompt(table_lines)
-
- logger.info("LLM review streaming: batch %d — sending %d entries to %s",
- batch_start // batch_size, len(batch_entries), model)
-
- t0 = time.time()
- async with httpx.AsyncClient(timeout=300.0) as client:
- resp = await client.post(
- f"{_OLLAMA_URL}/api/chat",
- json={
- "model": model,
- "messages": [{"role": "user", "content": prompt}],
- "stream": False,
- "think": False, # qwen3: disable chain-of-thought
- "options": {"temperature": 0.1, "num_predict": 8192},
- },
- )
- resp.raise_for_status()
- content = resp.json().get("message", {}).get("content", "")
- batch_ms = int((time.time() - t0) * 1000)
- total_duration_ms += batch_ms
-
- logger.info("LLM review streaming: response %dms, length=%d chars", batch_ms, len(content))
- logger.debug("LLM review streaming raw (first 500): %.500s", content)
-
- corrected = _parse_llm_json_array(content)
- logger.info("LLM review streaming: parsed %d entries, applying diff...", len(corrected))
- batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)
-
- # Merge back
- for batch_idx, (orig_idx, _) in enumerate(batch_items):
- if batch_idx < len(batch_corrected):
- all_corrected[orig_idx] = batch_corrected[batch_idx]
-
- all_changes.extend(batch_changes)
- reviewed_count += len(batch_items)
-
- # Yield batch result
- yield {
- "type": "batch",
- "batch_index": batch_start // batch_size,
- "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
- "changes": batch_changes,
- "duration_ms": batch_ms,
- "progress": {"current": reviewed_count, "total": total_to_review},
- }
-
- # Complete event
- yield {
- "type": "complete",
- "changes": all_changes,
- "model_used": model,
- "duration_ms": total_duration_ms,
- "total_entries": len(entries),
- "reviewed": total_to_review,
- "skipped": len(skipped_indices),
- "corrections_found": len(all_changes),
- "entries_corrected": all_corrected,
- }
-
-
-def _sanitize_for_json(text: str) -> str:
- """Remove or escape control characters that break JSON parsing.
-
- Keeps tab (\\t), newline (\\n), carriage return (\\r) which are valid
- JSON whitespace. Removes all other ASCII control characters (0x00-0x1f)
- that are only valid inside JSON strings when properly escaped.
- """
- # Replace literal control chars (except \\t \\n \\r) with a space
- return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
-
-
-def _parse_llm_json_array(text: str) -> List[Dict]:
- """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
- # Strip qwen3
', '\n', html_content, flags=re.IGNORECASE)
+ html_content = re.sub(r'