diff --git a/.claude/rules/loc-exceptions.txt b/.claude/rules/loc-exceptions.txt
index 76b4aa7..c3cced7 100644
--- a/.claude/rules/loc-exceptions.txt
+++ b/.claude/rules/loc-exceptions.txt
@@ -17,6 +17,7 @@
# Pure Data Registries (keine Logik, nur Daten-Definitionen)
**/dsfa_sources_registry.py | owner=klausur | reason=Pure data registry (license + source definitions, no logic) | review=2027-01-01
+**/legal_corpus_registry.py | owner=klausur | reason=Pure data registry (Regulation dataclass + 47 regulation definitions, no logic) | review=2027-01-01
**/backlog/backlog-items.ts | owner=admin-lehrer | reason=Pure data array (506 LOC, no logic, only BacklogItem[] literals) | review=2027-01-01
**/lib/module-registry-data.ts | owner=admin-lehrer | reason=Pure data array (510 LOC, no logic, only BackendModule[] literals) | review=2027-01-01
diff --git a/backend-lehrer/classroom_engine/repository.py b/backend-lehrer/classroom_engine/repository.py
index 66a0302..777d714 100644
--- a/backend-lehrer/classroom_engine/repository.py
+++ b/backend-lehrer/classroom_engine/repository.py
@@ -1,1705 +1,33 @@
"""
-Session Repository - CRUD Operationen fuer Classroom Sessions (Feature f14).
+Session Repository - Re-export Hub.
-Abstraktion der Datenbank-Operationen fuer LessonSessions.
+Alle Repository-Klassen werden aus ihren Domain-Modulen re-exportiert,
+damit bestehende Imports unveraendert funktionieren:
+
+ from classroom_engine.repository import SessionRepository
+ from .repository import TeacherContextRepository
"""
-from datetime import datetime
-from typing import Optional, List, Dict, Any
-from sqlalchemy.orm import Session as DBSession
-
-from .db_models import (
- LessonSessionDB, PhaseHistoryDB, LessonTemplateDB, TeacherSettingsDB,
- LessonPhaseEnum, HomeworkDB, HomeworkStatusEnum, PhaseMaterialDB, MaterialTypeEnum,
- LessonReflectionDB, TeacherFeedbackDB, FeedbackTypeEnum, FeedbackStatusEnum,
- FeedbackPriorityEnum
-)
-from .context_models import (
- TeacherContextDB, SchoolyearEventDB, RecurringRoutineDB,
- MacroPhaseEnum, EventTypeEnum, EventStatusEnum,
- RoutineTypeEnum, RecurrencePatternEnum,
- FEDERAL_STATES, SCHOOL_TYPES
-)
-from .models import (
- LessonSession, LessonTemplate, LessonPhase, Homework, HomeworkStatus,
- PhaseMaterial, MaterialType, get_default_durations
-)
-from .analytics import (
- LessonReflection, SessionSummary, TeacherAnalytics, AnalyticsCalculator
+from .repository_session import SessionRepository, TeacherSettingsRepository
+from .repository_template import TemplateRepository
+from .repository_homework import HomeworkRepository, MaterialRepository
+from .repository_reflection import ReflectionRepository, AnalyticsRepository
+from .repository_feedback import TeacherFeedbackRepository
+from .repository_context import (
+ TeacherContextRepository,
+ SchoolyearEventRepository,
+ RecurringRoutineRepository,
)
-
-class SessionRepository:
- """Repository fuer LessonSession CRUD-Operationen."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- # ==================== CREATE ====================
-
- def create(self, session: LessonSession) -> LessonSessionDB:
- """
- Erstellt eine neue Session in der Datenbank.
-
- Args:
- session: LessonSession Dataclass
-
- Returns:
- LessonSessionDB Model
- """
- db_session = LessonSessionDB(
- id=session.session_id,
- teacher_id=session.teacher_id,
- class_id=session.class_id,
- subject=session.subject,
- topic=session.topic,
- current_phase=LessonPhaseEnum(session.current_phase.value),
- is_paused=session.is_paused,
- lesson_started_at=session.lesson_started_at,
- lesson_ended_at=session.lesson_ended_at,
- phase_started_at=session.phase_started_at,
- pause_started_at=session.pause_started_at,
- total_paused_seconds=session.total_paused_seconds,
- phase_durations=session.phase_durations,
- phase_history=session.phase_history,
- notes=session.notes,
- homework=session.homework,
- )
- self.db.add(db_session)
- self.db.commit()
- self.db.refresh(db_session)
- return db_session
-
- # ==================== READ ====================
-
- def get_by_id(self, session_id: str) -> Optional[LessonSessionDB]:
- """Holt eine Session nach ID."""
- return self.db.query(LessonSessionDB).filter(
- LessonSessionDB.id == session_id
- ).first()
-
- def get_active_by_teacher(self, teacher_id: str) -> List[LessonSessionDB]:
- """Holt alle aktiven Sessions eines Lehrers."""
- return self.db.query(LessonSessionDB).filter(
- LessonSessionDB.teacher_id == teacher_id,
- LessonSessionDB.current_phase != LessonPhaseEnum.ENDED
- ).all()
-
- def get_history_by_teacher(
- self,
- teacher_id: str,
- limit: int = 20,
- offset: int = 0
- ) -> List[LessonSessionDB]:
- """Holt Session-History eines Lehrers (Feature f17)."""
- return self.db.query(LessonSessionDB).filter(
- LessonSessionDB.teacher_id == teacher_id,
- LessonSessionDB.current_phase == LessonPhaseEnum.ENDED
- ).order_by(
- LessonSessionDB.lesson_ended_at.desc()
- ).offset(offset).limit(limit).all()
-
- def get_by_class(
- self,
- class_id: str,
- limit: int = 20
- ) -> List[LessonSessionDB]:
- """Holt Sessions einer Klasse."""
- return self.db.query(LessonSessionDB).filter(
- LessonSessionDB.class_id == class_id
- ).order_by(
- LessonSessionDB.created_at.desc()
- ).limit(limit).all()
-
- # ==================== UPDATE ====================
-
- def update(self, session: LessonSession) -> Optional[LessonSessionDB]:
- """
- Aktualisiert eine bestehende Session.
-
- Args:
- session: LessonSession Dataclass mit aktualisierten Werten
-
- Returns:
- Aktualisierte LessonSessionDB oder None
- """
- db_session = self.get_by_id(session.session_id)
- if not db_session:
- return None
-
- db_session.current_phase = LessonPhaseEnum(session.current_phase.value)
- db_session.is_paused = session.is_paused
- db_session.lesson_started_at = session.lesson_started_at
- db_session.lesson_ended_at = session.lesson_ended_at
- db_session.phase_started_at = session.phase_started_at
- db_session.pause_started_at = session.pause_started_at
- db_session.total_paused_seconds = session.total_paused_seconds
- db_session.phase_durations = session.phase_durations
- db_session.phase_history = session.phase_history
- db_session.notes = session.notes
- db_session.homework = session.homework
-
- self.db.commit()
- self.db.refresh(db_session)
- return db_session
-
- def update_notes(
- self,
- session_id: str,
- notes: str,
- homework: str
- ) -> Optional[LessonSessionDB]:
- """Aktualisiert nur Notizen und Hausaufgaben."""
- db_session = self.get_by_id(session_id)
- if not db_session:
- return None
-
- db_session.notes = notes
- db_session.homework = homework
-
- self.db.commit()
- self.db.refresh(db_session)
- return db_session
-
- # ==================== DELETE ====================
-
- def delete(self, session_id: str) -> bool:
- """Loescht eine Session."""
- db_session = self.get_by_id(session_id)
- if not db_session:
- return False
-
- self.db.delete(db_session)
- self.db.commit()
- return True
-
- # ==================== CONVERSION ====================
-
- def to_dataclass(self, db_session: LessonSessionDB) -> LessonSession:
- """
- Konvertiert DB-Model zu Dataclass.
-
- Args:
- db_session: LessonSessionDB Model
-
- Returns:
- LessonSession Dataclass
- """
- return LessonSession(
- session_id=db_session.id,
- teacher_id=db_session.teacher_id,
- class_id=db_session.class_id,
- subject=db_session.subject,
- topic=db_session.topic,
- current_phase=LessonPhase(db_session.current_phase.value),
- phase_started_at=db_session.phase_started_at,
- lesson_started_at=db_session.lesson_started_at,
- lesson_ended_at=db_session.lesson_ended_at,
- is_paused=db_session.is_paused,
- pause_started_at=db_session.pause_started_at,
- total_paused_seconds=db_session.total_paused_seconds or 0,
- phase_durations=db_session.phase_durations or get_default_durations(),
- phase_history=db_session.phase_history or [],
- notes=db_session.notes or "",
- homework=db_session.homework or "",
- )
-
-
-class TeacherSettingsRepository:
- """Repository fuer Lehrer-Einstellungen (Feature f16)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- def get_or_create(self, teacher_id: str) -> TeacherSettingsDB:
- """Holt oder erstellt Einstellungen fuer einen Lehrer."""
- settings = self.db.query(TeacherSettingsDB).filter(
- TeacherSettingsDB.teacher_id == teacher_id
- ).first()
-
- if not settings:
- settings = TeacherSettingsDB(
- teacher_id=teacher_id,
- default_phase_durations=get_default_durations(),
- )
- self.db.add(settings)
- self.db.commit()
- self.db.refresh(settings)
-
- return settings
-
- def update_phase_durations(
- self,
- teacher_id: str,
- durations: Dict[str, int]
- ) -> TeacherSettingsDB:
- """Aktualisiert die Standard-Phasendauern."""
- settings = self.get_or_create(teacher_id)
- settings.default_phase_durations = durations
- self.db.commit()
- self.db.refresh(settings)
- return settings
-
- def update_preferences(
- self,
- teacher_id: str,
- audio_enabled: Optional[bool] = None,
- high_contrast: Optional[bool] = None,
- show_statistics: Optional[bool] = None
- ) -> TeacherSettingsDB:
- """Aktualisiert UI-Praeferenzen."""
- settings = self.get_or_create(teacher_id)
-
- if audio_enabled is not None:
- settings.audio_enabled = audio_enabled
- if high_contrast is not None:
- settings.high_contrast = high_contrast
- if show_statistics is not None:
- settings.show_statistics = show_statistics
-
- self.db.commit()
- self.db.refresh(settings)
- return settings
-
-
-class TemplateRepository:
- """Repository fuer Stunden-Vorlagen (Feature f37)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- # ==================== CREATE ====================
-
- def create(self, template: LessonTemplate) -> LessonTemplateDB:
- """Erstellt eine neue Vorlage."""
- db_template = LessonTemplateDB(
- id=template.template_id,
- teacher_id=template.teacher_id,
- name=template.name,
- description=template.description,
- subject=template.subject,
- grade_level=template.grade_level,
- phase_durations=template.phase_durations,
- default_topic=template.default_topic,
- default_notes=template.default_notes,
- is_public=template.is_public,
- usage_count=template.usage_count,
- )
- self.db.add(db_template)
- self.db.commit()
- self.db.refresh(db_template)
- return db_template
-
- # ==================== READ ====================
-
- def get_by_id(self, template_id: str) -> Optional[LessonTemplateDB]:
- """Holt eine Vorlage nach ID."""
- return self.db.query(LessonTemplateDB).filter(
- LessonTemplateDB.id == template_id
- ).first()
-
- def get_by_teacher(
- self,
- teacher_id: str,
- include_public: bool = True
- ) -> List[LessonTemplateDB]:
- """
- Holt alle Vorlagen eines Lehrers.
-
- Args:
- teacher_id: ID des Lehrers
- include_public: Auch oeffentliche Vorlagen anderer Lehrer einbeziehen
- """
- if include_public:
- return self.db.query(LessonTemplateDB).filter(
- (LessonTemplateDB.teacher_id == teacher_id) |
- (LessonTemplateDB.is_public == True)
- ).order_by(
- LessonTemplateDB.usage_count.desc()
- ).all()
- else:
- return self.db.query(LessonTemplateDB).filter(
- LessonTemplateDB.teacher_id == teacher_id
- ).order_by(
- LessonTemplateDB.created_at.desc()
- ).all()
-
- def get_public_templates(self, limit: int = 20) -> List[LessonTemplateDB]:
- """Holt oeffentliche Vorlagen, sortiert nach Beliebtheit."""
- return self.db.query(LessonTemplateDB).filter(
- LessonTemplateDB.is_public == True
- ).order_by(
- LessonTemplateDB.usage_count.desc()
- ).limit(limit).all()
-
- def get_by_subject(
- self,
- subject: str,
- teacher_id: Optional[str] = None
- ) -> List[LessonTemplateDB]:
- """Holt Vorlagen fuer ein bestimmtes Fach."""
- query = self.db.query(LessonTemplateDB).filter(
- LessonTemplateDB.subject == subject
- )
- if teacher_id:
- query = query.filter(
- (LessonTemplateDB.teacher_id == teacher_id) |
- (LessonTemplateDB.is_public == True)
- )
- else:
- query = query.filter(LessonTemplateDB.is_public == True)
-
- return query.order_by(
- LessonTemplateDB.usage_count.desc()
- ).all()
-
- # ==================== UPDATE ====================
-
- def update(self, template: LessonTemplate) -> Optional[LessonTemplateDB]:
- """Aktualisiert eine Vorlage."""
- db_template = self.get_by_id(template.template_id)
- if not db_template:
- return None
-
- db_template.name = template.name
- db_template.description = template.description
- db_template.subject = template.subject
- db_template.grade_level = template.grade_level
- db_template.phase_durations = template.phase_durations
- db_template.default_topic = template.default_topic
- db_template.default_notes = template.default_notes
- db_template.is_public = template.is_public
-
- self.db.commit()
- self.db.refresh(db_template)
- return db_template
-
- def increment_usage(self, template_id: str) -> Optional[LessonTemplateDB]:
- """Erhoeht den Usage-Counter einer Vorlage."""
- db_template = self.get_by_id(template_id)
- if not db_template:
- return None
-
- db_template.usage_count += 1
- self.db.commit()
- self.db.refresh(db_template)
- return db_template
-
- # ==================== DELETE ====================
-
- def delete(self, template_id: str) -> bool:
- """Loescht eine Vorlage."""
- db_template = self.get_by_id(template_id)
- if not db_template:
- return False
-
- self.db.delete(db_template)
- self.db.commit()
- return True
-
- # ==================== CONVERSION ====================
-
- def to_dataclass(self, db_template: LessonTemplateDB) -> LessonTemplate:
- """Konvertiert DB-Model zu Dataclass."""
- return LessonTemplate(
- template_id=db_template.id,
- teacher_id=db_template.teacher_id,
- name=db_template.name,
- description=db_template.description or "",
- subject=db_template.subject or "",
- grade_level=db_template.grade_level or "",
- phase_durations=db_template.phase_durations or get_default_durations(),
- default_topic=db_template.default_topic or "",
- default_notes=db_template.default_notes or "",
- is_public=db_template.is_public,
- usage_count=db_template.usage_count,
- created_at=db_template.created_at,
- updated_at=db_template.updated_at,
- )
-
-
-class HomeworkRepository:
- """Repository fuer Hausaufgaben-Tracking (Feature f20)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- # ==================== CREATE ====================
-
- def create(self, homework: Homework) -> HomeworkDB:
- """Erstellt eine neue Hausaufgabe."""
- db_homework = HomeworkDB(
- id=homework.homework_id,
- teacher_id=homework.teacher_id,
- class_id=homework.class_id,
- subject=homework.subject,
- title=homework.title,
- description=homework.description,
- session_id=homework.session_id,
- due_date=homework.due_date,
- status=HomeworkStatusEnum(homework.status.value),
- )
- self.db.add(db_homework)
- self.db.commit()
- self.db.refresh(db_homework)
- return db_homework
-
- # ==================== READ ====================
-
- def get_by_id(self, homework_id: str) -> Optional[HomeworkDB]:
- """Holt eine Hausaufgabe nach ID."""
- return self.db.query(HomeworkDB).filter(
- HomeworkDB.id == homework_id
- ).first()
-
- def get_by_teacher(
- self,
- teacher_id: str,
- status: Optional[str] = None,
- limit: int = 50
- ) -> List[HomeworkDB]:
- """Holt alle Hausaufgaben eines Lehrers."""
- query = self.db.query(HomeworkDB).filter(
- HomeworkDB.teacher_id == teacher_id
- )
- if status:
- query = query.filter(HomeworkDB.status == HomeworkStatusEnum(status))
- return query.order_by(
- HomeworkDB.due_date.asc().nullslast(),
- HomeworkDB.created_at.desc()
- ).limit(limit).all()
-
- def get_by_class(
- self,
- class_id: str,
- teacher_id: str,
- include_completed: bool = False,
- limit: int = 20
- ) -> List[HomeworkDB]:
- """Holt alle Hausaufgaben einer Klasse."""
- query = self.db.query(HomeworkDB).filter(
- HomeworkDB.class_id == class_id,
- HomeworkDB.teacher_id == teacher_id
- )
- if not include_completed:
- query = query.filter(HomeworkDB.status != HomeworkStatusEnum.COMPLETED)
- return query.order_by(
- HomeworkDB.due_date.asc().nullslast(),
- HomeworkDB.created_at.desc()
- ).limit(limit).all()
-
- def get_by_session(self, session_id: str) -> List[HomeworkDB]:
- """Holt alle Hausaufgaben einer Session."""
- return self.db.query(HomeworkDB).filter(
- HomeworkDB.session_id == session_id
- ).order_by(HomeworkDB.created_at.desc()).all()
-
- def get_pending(
- self,
- teacher_id: str,
- days_ahead: int = 7
- ) -> List[HomeworkDB]:
- """Holt anstehende Hausaufgaben der naechsten X Tage."""
- from datetime import timedelta
- cutoff = datetime.utcnow() + timedelta(days=days_ahead)
- return self.db.query(HomeworkDB).filter(
- HomeworkDB.teacher_id == teacher_id,
- HomeworkDB.status.in_([HomeworkStatusEnum.ASSIGNED, HomeworkStatusEnum.IN_PROGRESS]),
- HomeworkDB.due_date <= cutoff
- ).order_by(HomeworkDB.due_date.asc()).all()
-
- # ==================== UPDATE ====================
-
- def update_status(
- self,
- homework_id: str,
- status: HomeworkStatus
- ) -> Optional[HomeworkDB]:
- """Aktualisiert den Status einer Hausaufgabe."""
- db_homework = self.get_by_id(homework_id)
- if not db_homework:
- return None
-
- db_homework.status = HomeworkStatusEnum(status.value)
- self.db.commit()
- self.db.refresh(db_homework)
- return db_homework
-
- def update(self, homework: Homework) -> Optional[HomeworkDB]:
- """Aktualisiert eine Hausaufgabe."""
- db_homework = self.get_by_id(homework.homework_id)
- if not db_homework:
- return None
-
- db_homework.title = homework.title
- db_homework.description = homework.description
- db_homework.due_date = homework.due_date
- db_homework.status = HomeworkStatusEnum(homework.status.value)
-
- self.db.commit()
- self.db.refresh(db_homework)
- return db_homework
-
- # ==================== DELETE ====================
-
- def delete(self, homework_id: str) -> bool:
- """Loescht eine Hausaufgabe."""
- db_homework = self.get_by_id(homework_id)
- if not db_homework:
- return False
-
- self.db.delete(db_homework)
- self.db.commit()
- return True
-
- # ==================== CONVERSION ====================
-
- def to_dataclass(self, db_homework: HomeworkDB) -> Homework:
- """Konvertiert DB-Model zu Dataclass."""
- return Homework(
- homework_id=db_homework.id,
- teacher_id=db_homework.teacher_id,
- class_id=db_homework.class_id,
- subject=db_homework.subject,
- title=db_homework.title,
- description=db_homework.description or "",
- session_id=db_homework.session_id,
- due_date=db_homework.due_date,
- status=HomeworkStatus(db_homework.status.value),
- created_at=db_homework.created_at,
- updated_at=db_homework.updated_at,
- )
-
-
-class MaterialRepository:
- """Repository fuer Phasen-Materialien (Feature f19)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- # ==================== CREATE ====================
-
- def create(self, material: PhaseMaterial) -> PhaseMaterialDB:
- """Erstellt ein neues Material."""
- db_material = PhaseMaterialDB(
- id=material.material_id,
- teacher_id=material.teacher_id,
- title=material.title,
- material_type=MaterialTypeEnum(material.material_type.value),
- url=material.url,
- description=material.description,
- phase=material.phase,
- subject=material.subject,
- grade_level=material.grade_level,
- tags=material.tags,
- is_public=material.is_public,
- usage_count=material.usage_count,
- session_id=material.session_id,
- )
- self.db.add(db_material)
- self.db.commit()
- self.db.refresh(db_material)
- return db_material
-
- # ==================== READ ====================
-
- def get_by_id(self, material_id: str) -> Optional[PhaseMaterialDB]:
- """Holt ein Material nach ID."""
- return self.db.query(PhaseMaterialDB).filter(
- PhaseMaterialDB.id == material_id
- ).first()
-
- def get_by_teacher(
- self,
- teacher_id: str,
- phase: Optional[str] = None,
- subject: Optional[str] = None,
- limit: int = 50
- ) -> List[PhaseMaterialDB]:
- """Holt alle Materialien eines Lehrers."""
- query = self.db.query(PhaseMaterialDB).filter(
- PhaseMaterialDB.teacher_id == teacher_id
- )
- if phase:
- query = query.filter(PhaseMaterialDB.phase == phase)
- if subject:
- query = query.filter(PhaseMaterialDB.subject == subject)
-
- return query.order_by(
- PhaseMaterialDB.usage_count.desc(),
- PhaseMaterialDB.created_at.desc()
- ).limit(limit).all()
-
- def get_by_phase(
- self,
- phase: str,
- teacher_id: str,
- include_public: bool = True
- ) -> List[PhaseMaterialDB]:
- """Holt alle Materialien fuer eine bestimmte Phase."""
- if include_public:
- return self.db.query(PhaseMaterialDB).filter(
- PhaseMaterialDB.phase == phase,
- (PhaseMaterialDB.teacher_id == teacher_id) |
- (PhaseMaterialDB.is_public == True)
- ).order_by(
- PhaseMaterialDB.usage_count.desc()
- ).all()
- else:
- return self.db.query(PhaseMaterialDB).filter(
- PhaseMaterialDB.phase == phase,
- PhaseMaterialDB.teacher_id == teacher_id
- ).order_by(
- PhaseMaterialDB.created_at.desc()
- ).all()
-
- def get_by_session(self, session_id: str) -> List[PhaseMaterialDB]:
- """Holt alle Materialien einer Session."""
- return self.db.query(PhaseMaterialDB).filter(
- PhaseMaterialDB.session_id == session_id
- ).order_by(PhaseMaterialDB.phase, PhaseMaterialDB.created_at).all()
-
- def get_public_materials(
- self,
- phase: Optional[str] = None,
- subject: Optional[str] = None,
- limit: int = 20
- ) -> List[PhaseMaterialDB]:
- """Holt oeffentliche Materialien."""
- query = self.db.query(PhaseMaterialDB).filter(
- PhaseMaterialDB.is_public == True
- )
- if phase:
- query = query.filter(PhaseMaterialDB.phase == phase)
- if subject:
- query = query.filter(PhaseMaterialDB.subject == subject)
-
- return query.order_by(
- PhaseMaterialDB.usage_count.desc()
- ).limit(limit).all()
-
- def search_by_tags(
- self,
- tags: List[str],
- teacher_id: Optional[str] = None
- ) -> List[PhaseMaterialDB]:
- """Sucht Materialien nach Tags."""
- # SQLite/PostgreSQL JSON contains
- query = self.db.query(PhaseMaterialDB)
- if teacher_id:
- query = query.filter(
- (PhaseMaterialDB.teacher_id == teacher_id) |
- (PhaseMaterialDB.is_public == True)
- )
- else:
- query = query.filter(PhaseMaterialDB.is_public == True)
-
- # Filter by tags - vereinfachte Implementierung
- results = []
- for material in query.all():
- if material.tags and any(tag in material.tags for tag in tags):
- results.append(material)
- return results[:50]
-
- # ==================== UPDATE ====================
-
- def update(self, material: PhaseMaterial) -> Optional[PhaseMaterialDB]:
- """Aktualisiert ein Material."""
- db_material = self.get_by_id(material.material_id)
- if not db_material:
- return None
-
- db_material.title = material.title
- db_material.material_type = MaterialTypeEnum(material.material_type.value)
- db_material.url = material.url
- db_material.description = material.description
- db_material.phase = material.phase
- db_material.subject = material.subject
- db_material.grade_level = material.grade_level
- db_material.tags = material.tags
- db_material.is_public = material.is_public
-
- self.db.commit()
- self.db.refresh(db_material)
- return db_material
-
- def increment_usage(self, material_id: str) -> Optional[PhaseMaterialDB]:
- """Erhoeht den Usage-Counter eines Materials."""
- db_material = self.get_by_id(material_id)
- if not db_material:
- return None
-
- db_material.usage_count += 1
- self.db.commit()
- self.db.refresh(db_material)
- return db_material
-
- def attach_to_session(
- self,
- material_id: str,
- session_id: str
- ) -> Optional[PhaseMaterialDB]:
- """Verknuepft ein Material mit einer Session."""
- db_material = self.get_by_id(material_id)
- if not db_material:
- return None
-
- db_material.session_id = session_id
- db_material.usage_count += 1
- self.db.commit()
- self.db.refresh(db_material)
- return db_material
-
- # ==================== DELETE ====================
-
- def delete(self, material_id: str) -> bool:
- """Loescht ein Material."""
- db_material = self.get_by_id(material_id)
- if not db_material:
- return False
-
- self.db.delete(db_material)
- self.db.commit()
- return True
-
- # ==================== CONVERSION ====================
-
- def to_dataclass(self, db_material: PhaseMaterialDB) -> PhaseMaterial:
- """Konvertiert DB-Model zu Dataclass."""
- return PhaseMaterial(
- material_id=db_material.id,
- teacher_id=db_material.teacher_id,
- title=db_material.title,
- material_type=MaterialType(db_material.material_type.value),
- url=db_material.url,
- description=db_material.description or "",
- phase=db_material.phase,
- subject=db_material.subject or "",
- grade_level=db_material.grade_level or "",
- tags=db_material.tags or [],
- is_public=db_material.is_public,
- usage_count=db_material.usage_count,
- session_id=db_material.session_id,
- created_at=db_material.created_at,
- updated_at=db_material.updated_at,
- )
-
-
-# ==================== REFLECTION REPOSITORY (Phase 5) ====================
-
-class ReflectionRepository:
- """Repository fuer LessonReflection CRUD-Operationen."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- # ==================== CREATE ====================
-
- def create(self, reflection: LessonReflection) -> LessonReflectionDB:
- """Erstellt eine neue Reflection."""
- db_reflection = LessonReflectionDB(
- id=reflection.reflection_id,
- session_id=reflection.session_id,
- teacher_id=reflection.teacher_id,
- notes=reflection.notes,
- overall_rating=reflection.overall_rating,
- what_worked=reflection.what_worked,
- improvements=reflection.improvements,
- notes_for_next_lesson=reflection.notes_for_next_lesson,
- )
- self.db.add(db_reflection)
- self.db.commit()
- self.db.refresh(db_reflection)
- return db_reflection
-
- # ==================== READ ====================
-
- def get_by_id(self, reflection_id: str) -> Optional[LessonReflectionDB]:
- """Holt eine Reflection nach ID."""
- return self.db.query(LessonReflectionDB).filter(
- LessonReflectionDB.id == reflection_id
- ).first()
-
- def get_by_session(self, session_id: str) -> Optional[LessonReflectionDB]:
- """Holt die Reflection einer Session."""
- return self.db.query(LessonReflectionDB).filter(
- LessonReflectionDB.session_id == session_id
- ).first()
-
- def get_by_teacher(
- self,
- teacher_id: str,
- limit: int = 20,
- offset: int = 0
- ) -> List[LessonReflectionDB]:
- """Holt alle Reflections eines Lehrers."""
- return self.db.query(LessonReflectionDB).filter(
- LessonReflectionDB.teacher_id == teacher_id
- ).order_by(
- LessonReflectionDB.created_at.desc()
- ).offset(offset).limit(limit).all()
-
- # ==================== UPDATE ====================
-
- def update(self, reflection: LessonReflection) -> Optional[LessonReflectionDB]:
- """Aktualisiert eine Reflection."""
- db_reflection = self.get_by_id(reflection.reflection_id)
- if not db_reflection:
- return None
-
- db_reflection.notes = reflection.notes
- db_reflection.overall_rating = reflection.overall_rating
- db_reflection.what_worked = reflection.what_worked
- db_reflection.improvements = reflection.improvements
- db_reflection.notes_for_next_lesson = reflection.notes_for_next_lesson
-
- self.db.commit()
- self.db.refresh(db_reflection)
- return db_reflection
-
- # ==================== DELETE ====================
-
- def delete(self, reflection_id: str) -> bool:
- """Loescht eine Reflection."""
- db_reflection = self.get_by_id(reflection_id)
- if not db_reflection:
- return False
-
- self.db.delete(db_reflection)
- self.db.commit()
- return True
-
- # ==================== CONVERSION ====================
-
- def to_dataclass(self, db_reflection: LessonReflectionDB) -> LessonReflection:
- """Konvertiert DB-Model zu Dataclass."""
- return LessonReflection(
- reflection_id=db_reflection.id,
- session_id=db_reflection.session_id,
- teacher_id=db_reflection.teacher_id,
- notes=db_reflection.notes or "",
- overall_rating=db_reflection.overall_rating,
- what_worked=db_reflection.what_worked or [],
- improvements=db_reflection.improvements or [],
- notes_for_next_lesson=db_reflection.notes_for_next_lesson or "",
- created_at=db_reflection.created_at,
- updated_at=db_reflection.updated_at,
- )
-
-
-# ==================== ANALYTICS REPOSITORY (Phase 5) ====================
-
-class AnalyticsRepository:
- """Repository fuer Analytics-Abfragen."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- def get_session_summary(self, session_id: str) -> Optional[SessionSummary]:
- """
- Berechnet die Summary einer abgeschlossenen Session.
-
- Args:
- session_id: ID der Session
-
- Returns:
- SessionSummary oder None wenn Session nicht gefunden
- """
- db_session = self.db.query(LessonSessionDB).filter(
- LessonSessionDB.id == session_id
- ).first()
-
- if not db_session:
- return None
-
- # Session-Daten zusammenstellen
- session_data = {
- "session_id": db_session.id,
- "teacher_id": db_session.teacher_id,
- "class_id": db_session.class_id,
- "subject": db_session.subject,
- "topic": db_session.topic,
- "lesson_started_at": db_session.lesson_started_at,
- "lesson_ended_at": db_session.lesson_ended_at,
- "phase_durations": db_session.phase_durations or {},
- }
-
- # Phase History aus DB oder JSON
- phase_history = db_session.phase_history or []
-
- # Summary berechnen
- return AnalyticsCalculator.calculate_session_summary(
- session_data, phase_history
- )
-
- def get_teacher_analytics(
- self,
- teacher_id: str,
- period_start: Optional[datetime] = None,
- period_end: Optional[datetime] = None
- ) -> TeacherAnalytics:
- """
- Berechnet aggregierte Statistiken fuer einen Lehrer.
-
- Args:
- teacher_id: ID des Lehrers
- period_start: Beginn des Zeitraums (default: 30 Tage zurueck)
- period_end: Ende des Zeitraums (default: jetzt)
-
- Returns:
- TeacherAnalytics mit aggregierten Statistiken
- """
- from datetime import timedelta
-
- if not period_end:
- period_end = datetime.utcnow()
- if not period_start:
- period_start = period_end - timedelta(days=30)
-
- # Sessions im Zeitraum abfragen
- sessions_query = self.db.query(LessonSessionDB).filter(
- LessonSessionDB.teacher_id == teacher_id,
- LessonSessionDB.lesson_started_at >= period_start,
- LessonSessionDB.lesson_started_at <= period_end
- ).all()
-
- # Sessions zu Dictionaries konvertieren
- sessions_data = []
- for db_session in sessions_query:
- sessions_data.append({
- "session_id": db_session.id,
- "teacher_id": db_session.teacher_id,
- "class_id": db_session.class_id,
- "subject": db_session.subject,
- "topic": db_session.topic,
- "lesson_started_at": db_session.lesson_started_at,
- "lesson_ended_at": db_session.lesson_ended_at,
- "phase_durations": db_session.phase_durations or {},
- "phase_history": db_session.phase_history or [],
- })
-
- return AnalyticsCalculator.calculate_teacher_analytics(
- sessions_data, period_start, period_end
- )
-
- def get_phase_duration_trends(
- self,
- teacher_id: str,
- phase: str,
- limit: int = 20
- ) -> List[Dict[str, Any]]:
- """
- Gibt die Dauer-Trends fuer eine bestimmte Phase zurueck.
-
- Args:
- teacher_id: ID des Lehrers
- phase: Phasen-ID (einstieg, erarbeitung, etc.)
- limit: Max Anzahl der Datenpunkte
-
- Returns:
- Liste von Datenpunkten [{date, planned, actual, difference}]
- """
- sessions = self.db.query(LessonSessionDB).filter(
- LessonSessionDB.teacher_id == teacher_id,
- LessonSessionDB.current_phase == LessonPhaseEnum.ENDED
- ).order_by(
- LessonSessionDB.lesson_ended_at.desc()
- ).limit(limit).all()
-
- trends = []
- for db_session in sessions:
- history = db_session.phase_history or []
- for entry in history:
- if entry.get("phase") == phase:
- planned = (db_session.phase_durations or {}).get(phase, 0) * 60
- actual = entry.get("duration_seconds", 0) or 0
- trends.append({
- "date": db_session.lesson_started_at.isoformat() if db_session.lesson_started_at else None,
- "session_id": db_session.id,
- "subject": db_session.subject,
- "planned_seconds": planned,
- "actual_seconds": actual,
- "difference_seconds": actual - planned,
- })
- break
-
- return list(reversed(trends)) # Chronologisch sortieren
-
- def get_overtime_analysis(
- self,
- teacher_id: str,
- limit: int = 30
- ) -> Dict[str, Any]:
- """
- Analysiert Overtime-Muster.
-
- Args:
- teacher_id: ID des Lehrers
- limit: Anzahl der zu analysierenden Sessions
-
- Returns:
- Dict mit Overtime-Statistiken pro Phase
- """
- sessions = self.db.query(LessonSessionDB).filter(
- LessonSessionDB.teacher_id == teacher_id,
- LessonSessionDB.current_phase == LessonPhaseEnum.ENDED
- ).order_by(
- LessonSessionDB.lesson_ended_at.desc()
- ).limit(limit).all()
-
- phase_overtime: Dict[str, List[int]] = {
- "einstieg": [],
- "erarbeitung": [],
- "sicherung": [],
- "transfer": [],
- "reflexion": [],
- }
-
- for db_session in sessions:
- history = db_session.phase_history or []
- phase_durations = db_session.phase_durations or {}
-
- for entry in history:
- phase = entry.get("phase", "")
- if phase in phase_overtime:
- planned = phase_durations.get(phase, 0) * 60
- actual = entry.get("duration_seconds", 0) or 0
- overtime = max(0, actual - planned)
- phase_overtime[phase].append(overtime)
-
- # Statistiken berechnen
- result = {}
- for phase, overtimes in phase_overtime.items():
- if overtimes:
- result[phase] = {
- "count": len([o for o in overtimes if o > 0]),
- "total": len(overtimes),
- "avg_overtime_seconds": sum(overtimes) / len(overtimes),
- "max_overtime_seconds": max(overtimes),
- "overtime_percentage": len([o for o in overtimes if o > 0]) / len(overtimes) * 100,
- }
- else:
- result[phase] = {
- "count": 0,
- "total": 0,
- "avg_overtime_seconds": 0,
- "max_overtime_seconds": 0,
- "overtime_percentage": 0,
- }
-
- return result
-
-
-# ==================== TEACHER FEEDBACK REPOSITORY (Phase 7) ====================
-
-
-class TeacherFeedbackRepository:
- """
- Repository fuer Lehrer-Feedback CRUD-Operationen.
-
- Ermoeglicht Lehrern, Feedback (Bugs, Feature-Requests, Verbesserungen)
- direkt aus dem Lehrer-Frontend zu senden.
- """
-
- def __init__(self, db: DBSession):
- self.db = db
-
- def create(
- self,
- teacher_id: str,
- title: str,
- description: str,
- feedback_type: str = "improvement",
- priority: str = "medium",
- teacher_name: str = "",
- teacher_email: str = "",
- context_url: str = "",
- context_phase: str = "",
- context_session_id: str = None,
- user_agent: str = "",
- related_feature: str = None,
- ) -> TeacherFeedbackDB:
- """Erstellt neues Feedback."""
- import uuid
-
- db_feedback = TeacherFeedbackDB(
- id=str(uuid.uuid4()),
- teacher_id=teacher_id,
- teacher_name=teacher_name,
- teacher_email=teacher_email,
- title=title,
- description=description,
- feedback_type=FeedbackTypeEnum(feedback_type),
- priority=FeedbackPriorityEnum(priority),
- status=FeedbackStatusEnum.NEW,
- related_feature=related_feature,
- context_url=context_url,
- context_phase=context_phase,
- context_session_id=context_session_id,
- user_agent=user_agent,
- )
-
- self.db.add(db_feedback)
- self.db.commit()
- self.db.refresh(db_feedback)
- return db_feedback
-
- def get_by_id(self, feedback_id: str) -> Optional[TeacherFeedbackDB]:
- """Holt Feedback nach ID."""
- return self.db.query(TeacherFeedbackDB).filter(
- TeacherFeedbackDB.id == feedback_id
- ).first()
-
- def get_all(
- self,
- status: str = None,
- feedback_type: str = None,
- limit: int = 100,
- offset: int = 0
- ) -> List[TeacherFeedbackDB]:
- """Holt alle Feedbacks mit optionalen Filtern."""
- query = self.db.query(TeacherFeedbackDB)
-
- if status:
- query = query.filter(TeacherFeedbackDB.status == FeedbackStatusEnum(status))
- if feedback_type:
- query = query.filter(TeacherFeedbackDB.feedback_type == FeedbackTypeEnum(feedback_type))
-
- return query.order_by(
- TeacherFeedbackDB.created_at.desc()
- ).offset(offset).limit(limit).all()
-
- def get_by_teacher(self, teacher_id: str, limit: int = 50) -> List[TeacherFeedbackDB]:
- """Holt Feedback eines bestimmten Lehrers."""
- return self.db.query(TeacherFeedbackDB).filter(
- TeacherFeedbackDB.teacher_id == teacher_id
- ).order_by(
- TeacherFeedbackDB.created_at.desc()
- ).limit(limit).all()
-
- def update_status(
- self,
- feedback_id: str,
- status: str,
- response: str = None,
- responded_by: str = None
- ) -> Optional[TeacherFeedbackDB]:
- """Aktualisiert den Status eines Feedbacks."""
- db_feedback = self.get_by_id(feedback_id)
- if not db_feedback:
- return None
-
- db_feedback.status = FeedbackStatusEnum(status)
- if response:
- db_feedback.response = response
- db_feedback.responded_at = datetime.utcnow()
- db_feedback.responded_by = responded_by
-
- self.db.commit()
- self.db.refresh(db_feedback)
- return db_feedback
-
- def delete(self, feedback_id: str) -> bool:
- """Loescht ein Feedback."""
- db_feedback = self.get_by_id(feedback_id)
- if not db_feedback:
- return False
-
- self.db.delete(db_feedback)
- self.db.commit()
- return True
-
- def get_stats(self) -> Dict[str, Any]:
- """Gibt Statistiken ueber alle Feedbacks zurueck."""
- all_feedback = self.db.query(TeacherFeedbackDB).all()
-
- stats = {
- "total": len(all_feedback),
- "by_status": {},
- "by_type": {},
- "by_priority": {},
- }
-
- for fb in all_feedback:
- # By Status
- status = fb.status.value
- stats["by_status"][status] = stats["by_status"].get(status, 0) + 1
-
- # By Type
- fb_type = fb.feedback_type.value
- stats["by_type"][fb_type] = stats["by_type"].get(fb_type, 0) + 1
-
- # By Priority
- priority = fb.priority.value
- stats["by_priority"][priority] = stats["by_priority"].get(priority, 0) + 1
-
- return stats
-
- def to_dict(self, db_feedback: TeacherFeedbackDB) -> Dict[str, Any]:
- """Konvertiert DB-Model zu Dictionary."""
- return {
- "id": db_feedback.id,
- "teacher_id": db_feedback.teacher_id,
- "teacher_name": db_feedback.teacher_name,
- "teacher_email": db_feedback.teacher_email,
- "title": db_feedback.title,
- "description": db_feedback.description,
- "feedback_type": db_feedback.feedback_type.value,
- "priority": db_feedback.priority.value,
- "status": db_feedback.status.value,
- "related_feature": db_feedback.related_feature,
- "context_url": db_feedback.context_url,
- "context_phase": db_feedback.context_phase,
- "context_session_id": db_feedback.context_session_id,
- "user_agent": db_feedback.user_agent,
- "response": db_feedback.response,
- "responded_at": db_feedback.responded_at.isoformat() if db_feedback.responded_at else None,
- "responded_by": db_feedback.responded_by,
- "created_at": db_feedback.created_at.isoformat() if db_feedback.created_at else None,
- "updated_at": db_feedback.updated_at.isoformat() if db_feedback.updated_at else None,
- }
-
-
-# ==================== Phase 8: Teacher Context Repository ====================
-
-
-class TeacherContextRepository:
- """Repository fuer Lehrer-Kontext CRUD-Operationen (Phase 8)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- # ==================== CREATE / GET-OR-CREATE ====================
-
- def get_or_create(self, teacher_id: str) -> TeacherContextDB:
- """
- Holt den Kontext eines Lehrers oder erstellt einen neuen.
-
- Args:
- teacher_id: ID des Lehrers
-
- Returns:
- TeacherContextDB Model
- """
- context = self.get_by_teacher_id(teacher_id)
- if context:
- return context
-
- # Neuen Kontext erstellen
- from uuid import uuid4
- context = TeacherContextDB(
- id=str(uuid4()),
- teacher_id=teacher_id,
- macro_phase=MacroPhaseEnum.ONBOARDING,
- )
- self.db.add(context)
- self.db.commit()
- self.db.refresh(context)
- return context
-
- # ==================== READ ====================
-
- def get_by_teacher_id(self, teacher_id: str) -> Optional[TeacherContextDB]:
- """Holt den Kontext eines Lehrers."""
- return self.db.query(TeacherContextDB).filter(
- TeacherContextDB.teacher_id == teacher_id
- ).first()
-
- # ==================== UPDATE ====================
-
- def update_context(
- self,
- teacher_id: str,
- federal_state: str = None,
- school_type: str = None,
- schoolyear: str = None,
- schoolyear_start: datetime = None,
- macro_phase: str = None,
- current_week: int = None,
- ) -> Optional[TeacherContextDB]:
- """Aktualisiert den Kontext eines Lehrers."""
- context = self.get_or_create(teacher_id)
-
- if federal_state is not None:
- context.federal_state = federal_state
- if school_type is not None:
- context.school_type = school_type
- if schoolyear is not None:
- context.schoolyear = schoolyear
- if schoolyear_start is not None:
- context.schoolyear_start = schoolyear_start
- if macro_phase is not None:
- context.macro_phase = MacroPhaseEnum(macro_phase)
- if current_week is not None:
- context.current_week = current_week
-
- self.db.commit()
- self.db.refresh(context)
- return context
-
- def complete_onboarding(self, teacher_id: str) -> TeacherContextDB:
- """Markiert Onboarding als abgeschlossen."""
- context = self.get_or_create(teacher_id)
- context.onboarding_completed = True
- context.macro_phase = MacroPhaseEnum.SCHULJAHRESSTART
- self.db.commit()
- self.db.refresh(context)
- return context
-
- def update_flags(
- self,
- teacher_id: str,
- has_classes: bool = None,
- has_schedule: bool = None,
- is_exam_period: bool = None,
- is_before_holidays: bool = None,
- ) -> TeacherContextDB:
- """Aktualisiert die Status-Flags eines Kontexts."""
- context = self.get_or_create(teacher_id)
-
- if has_classes is not None:
- context.has_classes = has_classes
- if has_schedule is not None:
- context.has_schedule = has_schedule
- if is_exam_period is not None:
- context.is_exam_period = is_exam_period
- if is_before_holidays is not None:
- context.is_before_holidays = is_before_holidays
-
- self.db.commit()
- self.db.refresh(context)
- return context
-
- def to_dict(self, context: TeacherContextDB) -> Dict[str, Any]:
- """Konvertiert DB-Model zu Dictionary."""
- return {
- "id": context.id,
- "teacher_id": context.teacher_id,
- "school": {
- "federal_state": context.federal_state,
- "federal_state_name": FEDERAL_STATES.get(context.federal_state, ""),
- "school_type": context.school_type,
- "school_type_name": SCHOOL_TYPES.get(context.school_type, ""),
- },
- "school_year": {
- "id": context.schoolyear,
- "start": context.schoolyear_start.isoformat() if context.schoolyear_start else None,
- "current_week": context.current_week,
- },
- "macro_phase": {
- "id": context.macro_phase.value,
- "label": self._get_phase_label(context.macro_phase),
- },
- "flags": {
- "onboarding_completed": context.onboarding_completed,
- "has_classes": context.has_classes,
- "has_schedule": context.has_schedule,
- "is_exam_period": context.is_exam_period,
- "is_before_holidays": context.is_before_holidays,
- },
- "created_at": context.created_at.isoformat() if context.created_at else None,
- "updated_at": context.updated_at.isoformat() if context.updated_at else None,
- }
-
- def _get_phase_label(self, phase: MacroPhaseEnum) -> str:
- """Gibt den Anzeigenamen einer Makro-Phase zurueck."""
- labels = {
- MacroPhaseEnum.ONBOARDING: "Einrichtung",
- MacroPhaseEnum.SCHULJAHRESSTART: "Schuljahresstart",
- MacroPhaseEnum.UNTERRICHTSAUFBAU: "Unterrichtsaufbau",
- MacroPhaseEnum.LEISTUNGSPHASE_1: "Leistungsphase 1",
- MacroPhaseEnum.HALBJAHRESABSCHLUSS: "Halbjahresabschluss",
- MacroPhaseEnum.LEISTUNGSPHASE_2: "Leistungsphase 2",
- MacroPhaseEnum.JAHRESABSCHLUSS: "Jahresabschluss",
- }
- return labels.get(phase, phase.value)
-
-
-# ==================== Phase 8: Schoolyear Event Repository ====================
-
-
-class SchoolyearEventRepository:
- """Repository fuer Schuljahr-Events (Phase 8)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- def create(
- self,
- teacher_id: str,
- title: str,
- start_date: datetime,
- event_type: str = "other",
- end_date: datetime = None,
- class_id: str = None,
- subject: str = None,
- description: str = "",
- needs_preparation: bool = True,
- reminder_days_before: int = 7,
- extra_data: Dict[str, Any] = None,
- ) -> SchoolyearEventDB:
- """Erstellt ein neues Schuljahr-Event."""
- from uuid import uuid4
- event = SchoolyearEventDB(
- id=str(uuid4()),
- teacher_id=teacher_id,
- title=title,
- event_type=EventTypeEnum(event_type),
- start_date=start_date,
- end_date=end_date,
- class_id=class_id,
- subject=subject,
- description=description,
- needs_preparation=needs_preparation,
- reminder_days_before=reminder_days_before,
- extra_data=extra_data or {},
- )
- self.db.add(event)
- self.db.commit()
- self.db.refresh(event)
- return event
-
- def get_by_id(self, event_id: str) -> Optional[SchoolyearEventDB]:
- """Holt ein Event nach ID."""
- return self.db.query(SchoolyearEventDB).filter(
- SchoolyearEventDB.id == event_id
- ).first()
-
- def get_by_teacher(
- self,
- teacher_id: str,
- status: str = None,
- event_type: str = None,
- limit: int = 50,
- ) -> List[SchoolyearEventDB]:
- """Holt Events eines Lehrers."""
- query = self.db.query(SchoolyearEventDB).filter(
- SchoolyearEventDB.teacher_id == teacher_id
- )
- if status:
- query = query.filter(SchoolyearEventDB.status == EventStatusEnum(status))
- if event_type:
- query = query.filter(SchoolyearEventDB.event_type == EventTypeEnum(event_type))
-
- return query.order_by(SchoolyearEventDB.start_date).limit(limit).all()
-
- def get_upcoming(
- self,
- teacher_id: str,
- days: int = 30,
- limit: int = 10,
- ) -> List[SchoolyearEventDB]:
- """Holt anstehende Events der naechsten X Tage."""
- from datetime import timedelta
- now = datetime.utcnow()
- end = now + timedelta(days=days)
-
- return self.db.query(SchoolyearEventDB).filter(
- SchoolyearEventDB.teacher_id == teacher_id,
- SchoolyearEventDB.start_date >= now,
- SchoolyearEventDB.start_date <= end,
- SchoolyearEventDB.status != EventStatusEnum.CANCELLED,
- ).order_by(SchoolyearEventDB.start_date).limit(limit).all()
-
- def update_status(
- self,
- event_id: str,
- status: str,
- preparation_done: bool = None,
- ) -> Optional[SchoolyearEventDB]:
- """Aktualisiert den Status eines Events."""
- event = self.get_by_id(event_id)
- if not event:
- return None
-
- event.status = EventStatusEnum(status)
- if preparation_done is not None:
- event.preparation_done = preparation_done
-
- self.db.commit()
- self.db.refresh(event)
- return event
-
- def delete(self, event_id: str) -> bool:
- """Loescht ein Event."""
- event = self.get_by_id(event_id)
- if not event:
- return False
- self.db.delete(event)
- self.db.commit()
- return True
-
- def to_dict(self, event: SchoolyearEventDB) -> Dict[str, Any]:
- """Konvertiert DB-Model zu Dictionary."""
- return {
- "id": event.id,
- "teacher_id": event.teacher_id,
- "event_type": event.event_type.value,
- "title": event.title,
- "description": event.description,
- "start_date": event.start_date.isoformat() if event.start_date else None,
- "end_date": event.end_date.isoformat() if event.end_date else None,
- "class_id": event.class_id,
- "subject": event.subject,
- "status": event.status.value,
- "needs_preparation": event.needs_preparation,
- "preparation_done": event.preparation_done,
- "reminder_days_before": event.reminder_days_before,
- "extra_data": event.extra_data,
- "created_at": event.created_at.isoformat() if event.created_at else None,
- }
-
-
-# ==================== Phase 8: Recurring Routine Repository ====================
-
-
-class RecurringRoutineRepository:
- """Repository fuer wiederkehrende Routinen (Phase 8)."""
-
- def __init__(self, db: DBSession):
- self.db = db
-
- def create(
- self,
- teacher_id: str,
- title: str,
- routine_type: str = "other",
- recurrence_pattern: str = "weekly",
- day_of_week: int = None,
- day_of_month: int = None,
- time_of_day: str = None, # Format: "14:00"
- duration_minutes: int = 60,
- description: str = "",
- valid_from: datetime = None,
- valid_until: datetime = None,
- ) -> RecurringRoutineDB:
- """Erstellt eine neue wiederkehrende Routine."""
- from uuid import uuid4
- from datetime import time as dt_time
-
- time_obj = None
- if time_of_day:
- parts = time_of_day.split(":")
- time_obj = dt_time(int(parts[0]), int(parts[1]))
-
- routine = RecurringRoutineDB(
- id=str(uuid4()),
- teacher_id=teacher_id,
- title=title,
- routine_type=RoutineTypeEnum(routine_type),
- recurrence_pattern=RecurrencePatternEnum(recurrence_pattern),
- day_of_week=day_of_week,
- day_of_month=day_of_month,
- time_of_day=time_obj,
- duration_minutes=duration_minutes,
- description=description,
- valid_from=valid_from,
- valid_until=valid_until,
- )
- self.db.add(routine)
- self.db.commit()
- self.db.refresh(routine)
- return routine
-
- def get_by_id(self, routine_id: str) -> Optional[RecurringRoutineDB]:
- """Holt eine Routine nach ID."""
- return self.db.query(RecurringRoutineDB).filter(
- RecurringRoutineDB.id == routine_id
- ).first()
-
- def get_by_teacher(
- self,
- teacher_id: str,
- is_active: bool = True,
- routine_type: str = None,
- ) -> List[RecurringRoutineDB]:
- """Holt Routinen eines Lehrers."""
- query = self.db.query(RecurringRoutineDB).filter(
- RecurringRoutineDB.teacher_id == teacher_id
- )
- if is_active is not None:
- query = query.filter(RecurringRoutineDB.is_active == is_active)
- if routine_type:
- query = query.filter(RecurringRoutineDB.routine_type == RoutineTypeEnum(routine_type))
-
- return query.all()
-
- def get_today(self, teacher_id: str) -> List[RecurringRoutineDB]:
- """Holt Routinen die heute stattfinden."""
- today = datetime.utcnow()
- day_of_week = today.weekday() # 0 = Montag
- day_of_month = today.day
-
- routines = self.get_by_teacher(teacher_id, is_active=True)
- today_routines = []
-
- for routine in routines:
- if routine.recurrence_pattern == RecurrencePatternEnum.DAILY:
- today_routines.append(routine)
- elif routine.recurrence_pattern == RecurrencePatternEnum.WEEKLY:
- if routine.day_of_week == day_of_week:
- today_routines.append(routine)
- elif routine.recurrence_pattern == RecurrencePatternEnum.BIWEEKLY:
- # Vereinfacht: Pruefen ob Tag passt (echte Logik braucht Startdatum)
- if routine.day_of_week == day_of_week:
- today_routines.append(routine)
- elif routine.recurrence_pattern == RecurrencePatternEnum.MONTHLY:
- if routine.day_of_month == day_of_month:
- today_routines.append(routine)
-
- return today_routines
-
- def update(
- self,
- routine_id: str,
- title: str = None,
- is_active: bool = None,
- day_of_week: int = None,
- time_of_day: str = None,
- ) -> Optional[RecurringRoutineDB]:
- """Aktualisiert eine Routine."""
- routine = self.get_by_id(routine_id)
- if not routine:
- return None
-
- if title is not None:
- routine.title = title
- if is_active is not None:
- routine.is_active = is_active
- if day_of_week is not None:
- routine.day_of_week = day_of_week
- if time_of_day is not None:
- from datetime import time as dt_time
- parts = time_of_day.split(":")
- routine.time_of_day = dt_time(int(parts[0]), int(parts[1]))
-
- self.db.commit()
- self.db.refresh(routine)
- return routine
-
- def delete(self, routine_id: str) -> bool:
- """Loescht eine Routine."""
- routine = self.get_by_id(routine_id)
- if not routine:
- return False
- self.db.delete(routine)
- self.db.commit()
- return True
-
- def to_dict(self, routine: RecurringRoutineDB) -> Dict[str, Any]:
- """Konvertiert DB-Model zu Dictionary."""
- return {
- "id": routine.id,
- "teacher_id": routine.teacher_id,
- "routine_type": routine.routine_type.value,
- "title": routine.title,
- "description": routine.description,
- "recurrence_pattern": routine.recurrence_pattern.value,
- "day_of_week": routine.day_of_week,
- "day_of_month": routine.day_of_month,
- "time_of_day": routine.time_of_day.isoformat() if routine.time_of_day else None,
- "duration_minutes": routine.duration_minutes,
- "is_active": routine.is_active,
- "valid_from": routine.valid_from.isoformat() if routine.valid_from else None,
- "valid_until": routine.valid_until.isoformat() if routine.valid_until else None,
- "created_at": routine.created_at.isoformat() if routine.created_at else None,
- }
+__all__ = [
+ "SessionRepository",
+ "TeacherSettingsRepository",
+ "TemplateRepository",
+ "HomeworkRepository",
+ "MaterialRepository",
+ "ReflectionRepository",
+ "AnalyticsRepository",
+ "TeacherFeedbackRepository",
+ "TeacherContextRepository",
+ "SchoolyearEventRepository",
+ "RecurringRoutineRepository",
+]
diff --git a/backend-lehrer/classroom_engine/repository_context.py b/backend-lehrer/classroom_engine/repository_context.py
new file mode 100644
index 0000000..5bbfe42
--- /dev/null
+++ b/backend-lehrer/classroom_engine/repository_context.py
@@ -0,0 +1,453 @@
+"""
+Teacher Context, Schoolyear Event & Recurring Routine Repositories.
+
+CRUD-Operationen fuer Schuljahres-Kontext (Phase 8).
+"""
+from datetime import datetime
+from typing import Optional, List, Dict, Any
+
+from sqlalchemy.orm import Session as DBSession
+
+from .context_models import (
+ TeacherContextDB, SchoolyearEventDB, RecurringRoutineDB,
+ MacroPhaseEnum, EventTypeEnum, EventStatusEnum,
+ RoutineTypeEnum, RecurrencePatternEnum,
+ FEDERAL_STATES, SCHOOL_TYPES,
+)
+
+
+class TeacherContextRepository:
+ """Repository fuer Lehrer-Kontext CRUD-Operationen (Phase 8)."""
+
+ def __init__(self, db: DBSession):
+ self.db = db
+
+ # ==================== CREATE / GET-OR-CREATE ====================
+
+ def get_or_create(self, teacher_id: str) -> TeacherContextDB:
+ """
+ Holt den Kontext eines Lehrers oder erstellt einen neuen.
+
+ Args:
+ teacher_id: ID des Lehrers
+
+ Returns:
+ TeacherContextDB Model
+ """
+ context = self.get_by_teacher_id(teacher_id)
+ if context:
+ return context
+
+ # Neuen Kontext erstellen
+ from uuid import uuid4
+ context = TeacherContextDB(
+ id=str(uuid4()),
+ teacher_id=teacher_id,
+ macro_phase=MacroPhaseEnum.ONBOARDING,
+ )
+ self.db.add(context)
+ self.db.commit()
+ self.db.refresh(context)
+ return context
+
+ # ==================== READ ====================
+
+ def get_by_teacher_id(self, teacher_id: str) -> Optional[TeacherContextDB]:
+ """Holt den Kontext eines Lehrers."""
+ return self.db.query(TeacherContextDB).filter(
+ TeacherContextDB.teacher_id == teacher_id
+ ).first()
+
+ # ==================== UPDATE ====================
+
+ def update_context(
+ self,
+ teacher_id: str,
+ federal_state: str = None,
+ school_type: str = None,
+ schoolyear: str = None,
+ schoolyear_start: datetime = None,
+ macro_phase: str = None,
+ current_week: int = None,
+ ) -> Optional[TeacherContextDB]:
+ """Aktualisiert den Kontext eines Lehrers."""
+ context = self.get_or_create(teacher_id)
+
+ if federal_state is not None:
+ context.federal_state = federal_state
+ if school_type is not None:
+ context.school_type = school_type
+ if schoolyear is not None:
+ context.schoolyear = schoolyear
+ if schoolyear_start is not None:
+ context.schoolyear_start = schoolyear_start
+ if macro_phase is not None:
+ context.macro_phase = MacroPhaseEnum(macro_phase)
+ if current_week is not None:
+ context.current_week = current_week
+
+ self.db.commit()
+ self.db.refresh(context)
+ return context
+
+ def complete_onboarding(self, teacher_id: str) -> TeacherContextDB:
+ """Markiert Onboarding als abgeschlossen."""
+ context = self.get_or_create(teacher_id)
+ context.onboarding_completed = True
+ context.macro_phase = MacroPhaseEnum.SCHULJAHRESSTART
+ self.db.commit()
+ self.db.refresh(context)
+ return context
+
+ def update_flags(
+ self,
+ teacher_id: str,
+ has_classes: bool = None,
+ has_schedule: bool = None,
+ is_exam_period: bool = None,
+ is_before_holidays: bool = None,
+ ) -> TeacherContextDB:
+ """Aktualisiert die Status-Flags eines Kontexts."""
+ context = self.get_or_create(teacher_id)
+
+ if has_classes is not None:
+ context.has_classes = has_classes
+ if has_schedule is not None:
+ context.has_schedule = has_schedule
+ if is_exam_period is not None:
+ context.is_exam_period = is_exam_period
+ if is_before_holidays is not None:
+ context.is_before_holidays = is_before_holidays
+
+ self.db.commit()
+ self.db.refresh(context)
+ return context
+
+ def to_dict(self, context: TeacherContextDB) -> Dict[str, Any]:
+ """Konvertiert DB-Model zu Dictionary."""
+ return {
+ "id": context.id,
+ "teacher_id": context.teacher_id,
+ "school": {
+ "federal_state": context.federal_state,
+ "federal_state_name": FEDERAL_STATES.get(context.federal_state, ""),
+ "school_type": context.school_type,
+ "school_type_name": SCHOOL_TYPES.get(context.school_type, ""),
+ },
+ "school_year": {
+ "id": context.schoolyear,
+ "start": context.schoolyear_start.isoformat() if context.schoolyear_start else None,
+ "current_week": context.current_week,
+ },
+ "macro_phase": {
+ "id": context.macro_phase.value,
+ "label": self._get_phase_label(context.macro_phase),
+ },
+ "flags": {
+ "onboarding_completed": context.onboarding_completed,
+ "has_classes": context.has_classes,
+ "has_schedule": context.has_schedule,
+ "is_exam_period": context.is_exam_period,
+ "is_before_holidays": context.is_before_holidays,
+ },
+ "created_at": context.created_at.isoformat() if context.created_at else None,
+ "updated_at": context.updated_at.isoformat() if context.updated_at else None,
+ }
+
+ def _get_phase_label(self, phase: MacroPhaseEnum) -> str:
+ """Gibt den Anzeigenamen einer Makro-Phase zurueck."""
+ labels = {
+ MacroPhaseEnum.ONBOARDING: "Einrichtung",
+ MacroPhaseEnum.SCHULJAHRESSTART: "Schuljahresstart",
+ MacroPhaseEnum.UNTERRICHTSAUFBAU: "Unterrichtsaufbau",
+ MacroPhaseEnum.LEISTUNGSPHASE_1: "Leistungsphase 1",
+ MacroPhaseEnum.HALBJAHRESABSCHLUSS: "Halbjahresabschluss",
+ MacroPhaseEnum.LEISTUNGSPHASE_2: "Leistungsphase 2",
+ MacroPhaseEnum.JAHRESABSCHLUSS: "Jahresabschluss",
+ }
+ return labels.get(phase, phase.value)
+
+
+class SchoolyearEventRepository:
+ """Repository fuer Schuljahr-Events (Phase 8)."""
+
+ def __init__(self, db: DBSession):
+ self.db = db
+
+ def create(
+ self,
+ teacher_id: str,
+ title: str,
+ start_date: datetime,
+ event_type: str = "other",
+ end_date: datetime = None,
+ class_id: str = None,
+ subject: str = None,
+ description: str = "",
+ needs_preparation: bool = True,
+ reminder_days_before: int = 7,
+ extra_data: Dict[str, Any] = None,
+ ) -> SchoolyearEventDB:
+ """Erstellt ein neues Schuljahr-Event."""
+ from uuid import uuid4
+ event = SchoolyearEventDB(
+ id=str(uuid4()),
+ teacher_id=teacher_id,
+ title=title,
+ event_type=EventTypeEnum(event_type),
+ start_date=start_date,
+ end_date=end_date,
+ class_id=class_id,
+ subject=subject,
+ description=description,
+ needs_preparation=needs_preparation,
+ reminder_days_before=reminder_days_before,
+ extra_data=extra_data or {},
+ )
+ self.db.add(event)
+ self.db.commit()
+ self.db.refresh(event)
+ return event
+
+ def get_by_id(self, event_id: str) -> Optional[SchoolyearEventDB]:
+ """Holt ein Event nach ID."""
+ return self.db.query(SchoolyearEventDB).filter(
+ SchoolyearEventDB.id == event_id
+ ).first()
+
+ def get_by_teacher(
+ self,
+ teacher_id: str,
+ status: str = None,
+ event_type: str = None,
+ limit: int = 50,
+ ) -> List[SchoolyearEventDB]:
+ """Holt Events eines Lehrers."""
+ query = self.db.query(SchoolyearEventDB).filter(
+ SchoolyearEventDB.teacher_id == teacher_id
+ )
+ if status:
+ query = query.filter(SchoolyearEventDB.status == EventStatusEnum(status))
+ if event_type:
+ query = query.filter(SchoolyearEventDB.event_type == EventTypeEnum(event_type))
+
+ return query.order_by(SchoolyearEventDB.start_date).limit(limit).all()
+
+ def get_upcoming(
+ self,
+ teacher_id: str,
+ days: int = 30,
+ limit: int = 10,
+ ) -> List[SchoolyearEventDB]:
+ """Holt anstehende Events der naechsten X Tage."""
+ from datetime import timedelta
+ now = datetime.utcnow()
+ end = now + timedelta(days=days)
+
+ return self.db.query(SchoolyearEventDB).filter(
+ SchoolyearEventDB.teacher_id == teacher_id,
+ SchoolyearEventDB.start_date >= now,
+ SchoolyearEventDB.start_date <= end,
+ SchoolyearEventDB.status != EventStatusEnum.CANCELLED,
+ ).order_by(SchoolyearEventDB.start_date).limit(limit).all()
+
+ def update_status(
+ self,
+ event_id: str,
+ status: str,
+ preparation_done: bool = None,
+ ) -> Optional[SchoolyearEventDB]:
+ """Aktualisiert den Status eines Events."""
+ event = self.get_by_id(event_id)
+ if not event:
+ return None
+
+ event.status = EventStatusEnum(status)
+ if preparation_done is not None:
+ event.preparation_done = preparation_done
+
+ self.db.commit()
+ self.db.refresh(event)
+ return event
+
+ def delete(self, event_id: str) -> bool:
+ """Loescht ein Event."""
+ event = self.get_by_id(event_id)
+ if not event:
+ return False
+ self.db.delete(event)
+ self.db.commit()
+ return True
+
+ def to_dict(self, event: SchoolyearEventDB) -> Dict[str, Any]:
+ """Konvertiert DB-Model zu Dictionary."""
+ return {
+ "id": event.id,
+ "teacher_id": event.teacher_id,
+ "event_type": event.event_type.value,
+ "title": event.title,
+ "description": event.description,
+ "start_date": event.start_date.isoformat() if event.start_date else None,
+ "end_date": event.end_date.isoformat() if event.end_date else None,
+ "class_id": event.class_id,
+ "subject": event.subject,
+ "status": event.status.value,
+ "needs_preparation": event.needs_preparation,
+ "preparation_done": event.preparation_done,
+ "reminder_days_before": event.reminder_days_before,
+ "extra_data": event.extra_data,
+ "created_at": event.created_at.isoformat() if event.created_at else None,
+ }
+
+
+class RecurringRoutineRepository:
+ """Repository fuer wiederkehrende Routinen (Phase 8)."""
+
+ def __init__(self, db: DBSession):
+ self.db = db
+
+ def create(
+ self,
+ teacher_id: str,
+ title: str,
+ routine_type: str = "other",
+ recurrence_pattern: str = "weekly",
+ day_of_week: int = None,
+ day_of_month: int = None,
+ time_of_day: str = None, # Format: "14:00"
+ duration_minutes: int = 60,
+ description: str = "",
+ valid_from: datetime = None,
+ valid_until: datetime = None,
+ ) -> RecurringRoutineDB:
+ """Erstellt eine neue wiederkehrende Routine."""
+ from uuid import uuid4
+ from datetime import time as dt_time
+
+ time_obj = None
+ if time_of_day:
+ parts = time_of_day.split(":")
+ time_obj = dt_time(int(parts[0]), int(parts[1]))
+
+ routine = RecurringRoutineDB(
+ id=str(uuid4()),
+ teacher_id=teacher_id,
+ title=title,
+ routine_type=RoutineTypeEnum(routine_type),
+ recurrence_pattern=RecurrencePatternEnum(recurrence_pattern),
+ day_of_week=day_of_week,
+ day_of_month=day_of_month,
+ time_of_day=time_obj,
+ duration_minutes=duration_minutes,
+ description=description,
+ valid_from=valid_from,
+ valid_until=valid_until,
+ )
+ self.db.add(routine)
+ self.db.commit()
+ self.db.refresh(routine)
+ return routine
+
+ def get_by_id(self, routine_id: str) -> Optional[RecurringRoutineDB]:
+ """Holt eine Routine nach ID."""
+ return self.db.query(RecurringRoutineDB).filter(
+ RecurringRoutineDB.id == routine_id
+ ).first()
+
+ def get_by_teacher(
+ self,
+ teacher_id: str,
+ is_active: bool = True,
+ routine_type: str = None,
+ ) -> List[RecurringRoutineDB]:
+ """Holt Routinen eines Lehrers."""
+ query = self.db.query(RecurringRoutineDB).filter(
+ RecurringRoutineDB.teacher_id == teacher_id
+ )
+ if is_active is not None:
+ query = query.filter(RecurringRoutineDB.is_active == is_active)
+ if routine_type:
+ query = query.filter(RecurringRoutineDB.routine_type == RoutineTypeEnum(routine_type))
+
+ return query.all()
+
+ def get_today(self, teacher_id: str) -> List[RecurringRoutineDB]:
+ """Holt Routinen die heute stattfinden."""
+ today = datetime.utcnow()
+ day_of_week = today.weekday() # 0 = Montag
+ day_of_month = today.day
+
+ routines = self.get_by_teacher(teacher_id, is_active=True)
+ today_routines = []
+
+ for routine in routines:
+ if routine.recurrence_pattern == RecurrencePatternEnum.DAILY:
+ today_routines.append(routine)
+ elif routine.recurrence_pattern == RecurrencePatternEnum.WEEKLY:
+ if routine.day_of_week == day_of_week:
+ today_routines.append(routine)
+ elif routine.recurrence_pattern == RecurrencePatternEnum.BIWEEKLY:
+ # Vereinfacht: Pruefen ob Tag passt (echte Logik braucht Startdatum)
+ if routine.day_of_week == day_of_week:
+ today_routines.append(routine)
+ elif routine.recurrence_pattern == RecurrencePatternEnum.MONTHLY:
+ if routine.day_of_month == day_of_month:
+ today_routines.append(routine)
+
+ return today_routines
+
+ def update(
+ self,
+ routine_id: str,
+ title: str = None,
+ is_active: bool = None,
+ day_of_week: int = None,
+ time_of_day: str = None,
+ ) -> Optional[RecurringRoutineDB]:
+ """Aktualisiert eine Routine."""
+ routine = self.get_by_id(routine_id)
+ if not routine:
+ return None
+
+ if title is not None:
+ routine.title = title
+ if is_active is not None:
+ routine.is_active = is_active
+ if day_of_week is not None:
+ routine.day_of_week = day_of_week
+ if time_of_day is not None:
+ from datetime import time as dt_time
+ parts = time_of_day.split(":")
+ routine.time_of_day = dt_time(int(parts[0]), int(parts[1]))
+
+ self.db.commit()
+ self.db.refresh(routine)
+ return routine
+
+ def delete(self, routine_id: str) -> bool:
+ """Loescht eine Routine."""
+ routine = self.get_by_id(routine_id)
+ if not routine:
+ return False
+ self.db.delete(routine)
+ self.db.commit()
+ return True
+
+ def to_dict(self, routine: RecurringRoutineDB) -> Dict[str, Any]:
+ """Konvertiert DB-Model zu Dictionary."""
+ return {
+ "id": routine.id,
+ "teacher_id": routine.teacher_id,
+ "routine_type": routine.routine_type.value,
+ "title": routine.title,
+ "description": routine.description,
+ "recurrence_pattern": routine.recurrence_pattern.value,
+ "day_of_week": routine.day_of_week,
+ "day_of_month": routine.day_of_month,
+ "time_of_day": routine.time_of_day.isoformat() if routine.time_of_day else None,
+ "duration_minutes": routine.duration_minutes,
+ "is_active": routine.is_active,
+ "valid_from": routine.valid_from.isoformat() if routine.valid_from else None,
+ "valid_until": routine.valid_until.isoformat() if routine.valid_until else None,
+ "created_at": routine.created_at.isoformat() if routine.created_at else None,
+ }
diff --git a/backend-lehrer/classroom_engine/repository_feedback.py b/backend-lehrer/classroom_engine/repository_feedback.py
new file mode 100644
index 0000000..192b3e2
--- /dev/null
+++ b/backend-lehrer/classroom_engine/repository_feedback.py
@@ -0,0 +1,182 @@
+"""
+Teacher Feedback Repository.
+
+CRUD-Operationen fuer Lehrer-Feedback (Phase 7).
+Ermoeglicht Lehrern, Bugs, Feature-Requests und Verbesserungen zu melden.
+"""
+from datetime import datetime
+from typing import Optional, List, Dict, Any
+
+from sqlalchemy.orm import Session as DBSession
+
+from .db_models import (
+ TeacherFeedbackDB, FeedbackTypeEnum, FeedbackStatusEnum,
+ FeedbackPriorityEnum,
+)
+
+
class TeacherFeedbackRepository:
    """
    Repository for teacher feedback CRUD operations.

    Lets teachers report bugs, feature requests and improvements
    directly from the teacher frontend (phase 7).
    """

    def __init__(self, db: DBSession):
        # Active SQLAlchemy session; all operations commit through it.
        self.db = db

    def create(
        self,
        teacher_id: str,
        title: str,
        description: str,
        feedback_type: str = "improvement",
        priority: str = "medium",
        teacher_name: str = "",
        teacher_email: str = "",
        context_url: str = "",
        context_phase: str = "",
        context_session_id: Optional[str] = None,
        user_agent: str = "",
        related_feature: Optional[str] = None,
    ) -> TeacherFeedbackDB:
        """Create a new feedback entry with status NEW.

        Args:
            teacher_id: ID of the submitting teacher.
            title: Short summary of the feedback.
            description: Full feedback text.
            feedback_type: FeedbackTypeEnum value (default "improvement").
            priority: FeedbackPriorityEnum value (default "medium").
            teacher_name: Optional display name of the teacher.
            teacher_email: Optional contact e-mail.
            context_url: URL the feedback was submitted from.
            context_phase: Lesson phase active at submission time.
            context_session_id: Session the feedback refers to, if any.
            user_agent: Browser user agent captured by the frontend.
            related_feature: Feature ID the feedback refers to, if any.

        Returns:
            The persisted TeacherFeedbackDB row.

        Raises:
            ValueError: If feedback_type or priority is not a valid
                enum value.
        """
        import uuid  # local import: only needed for ID generation

        db_feedback = TeacherFeedbackDB(
            id=str(uuid.uuid4()),
            teacher_id=teacher_id,
            teacher_name=teacher_name,
            teacher_email=teacher_email,
            title=title,
            description=description,
            feedback_type=FeedbackTypeEnum(feedback_type),
            priority=FeedbackPriorityEnum(priority),
            status=FeedbackStatusEnum.NEW,
            related_feature=related_feature,
            context_url=context_url,
            context_phase=context_phase,
            context_session_id=context_session_id,
            user_agent=user_agent,
        )

        self.db.add(db_feedback)
        self.db.commit()
        self.db.refresh(db_feedback)
        return db_feedback

    def get_by_id(self, feedback_id: str) -> Optional[TeacherFeedbackDB]:
        """Return the feedback entry with the given ID, or None."""
        return self.db.query(TeacherFeedbackDB).filter(
            TeacherFeedbackDB.id == feedback_id
        ).first()

    def get_all(
        self,
        status: Optional[str] = None,
        feedback_type: Optional[str] = None,
        limit: int = 100,
        offset: int = 0
    ) -> List[TeacherFeedbackDB]:
        """Return all feedback, newest first, with optional filters.

        Raises:
            ValueError: If status or feedback_type is not a valid
                enum value.
        """
        query = self.db.query(TeacherFeedbackDB)

        if status:
            query = query.filter(TeacherFeedbackDB.status == FeedbackStatusEnum(status))
        if feedback_type:
            query = query.filter(TeacherFeedbackDB.feedback_type == FeedbackTypeEnum(feedback_type))

        return query.order_by(
            TeacherFeedbackDB.created_at.desc()
        ).offset(offset).limit(limit).all()

    def get_by_teacher(self, teacher_id: str, limit: int = 50) -> List[TeacherFeedbackDB]:
        """Return a specific teacher's feedback entries, newest first."""
        return self.db.query(TeacherFeedbackDB).filter(
            TeacherFeedbackDB.teacher_id == teacher_id
        ).order_by(
            TeacherFeedbackDB.created_at.desc()
        ).limit(limit).all()

    def update_status(
        self,
        feedback_id: str,
        status: str,
        response: Optional[str] = None,
        responded_by: Optional[str] = None
    ) -> Optional[TeacherFeedbackDB]:
        """Change the status of a feedback entry.

        When a response text is given, the response metadata (timestamp
        and responder) is recorded alongside it.

        Returns:
            The refreshed row, or None if the feedback does not exist.

        Raises:
            ValueError: If status is not a valid enum value.
        """
        db_feedback = self.get_by_id(feedback_id)
        if not db_feedback:
            return None

        db_feedback.status = FeedbackStatusEnum(status)
        if response:
            db_feedback.response = response
            db_feedback.responded_at = datetime.utcnow()
            db_feedback.responded_by = responded_by

        self.db.commit()
        self.db.refresh(db_feedback)
        return db_feedback

    def delete(self, feedback_id: str) -> bool:
        """Delete a feedback entry; True if a row was removed."""
        db_feedback = self.get_by_id(feedback_id)
        if not db_feedback:
            return False

        self.db.delete(db_feedback)
        self.db.commit()
        return True

    def get_stats(self) -> Dict[str, Any]:
        """Aggregate feedback counts by status, type and priority.

        NOTE(review): loads every row into memory; acceptable for small
        volumes, switch to a SQL GROUP BY if the table grows large.
        """
        from collections import Counter  # local import: only needed here

        all_feedback = self.db.query(TeacherFeedbackDB).all()
        return {
            "total": len(all_feedback),
            "by_status": dict(Counter(fb.status.value for fb in all_feedback)),
            "by_type": dict(Counter(fb.feedback_type.value for fb in all_feedback)),
            "by_priority": dict(Counter(fb.priority.value for fb in all_feedback)),
        }

    def to_dict(self, db_feedback: TeacherFeedbackDB) -> Dict[str, Any]:
        """Serialize a feedback row into a JSON-compatible dictionary."""
        return {
            "id": db_feedback.id,
            "teacher_id": db_feedback.teacher_id,
            "teacher_name": db_feedback.teacher_name,
            "teacher_email": db_feedback.teacher_email,
            "title": db_feedback.title,
            "description": db_feedback.description,
            "feedback_type": db_feedback.feedback_type.value,
            "priority": db_feedback.priority.value,
            "status": db_feedback.status.value,
            "related_feature": db_feedback.related_feature,
            "context_url": db_feedback.context_url,
            "context_phase": db_feedback.context_phase,
            "context_session_id": db_feedback.context_session_id,
            "user_agent": db_feedback.user_agent,
            "response": db_feedback.response,
            "responded_at": db_feedback.responded_at.isoformat() if db_feedback.responded_at else None,
            "responded_by": db_feedback.responded_by,
            "created_at": db_feedback.created_at.isoformat() if db_feedback.created_at else None,
            "updated_at": db_feedback.updated_at.isoformat() if db_feedback.updated_at else None,
        }
diff --git a/backend-lehrer/classroom_engine/repository_homework.py b/backend-lehrer/classroom_engine/repository_homework.py
new file mode 100644
index 0000000..25e07b9
--- /dev/null
+++ b/backend-lehrer/classroom_engine/repository_homework.py
@@ -0,0 +1,382 @@
+"""
+Homework & Material Repositories.
+
+CRUD-Operationen fuer Hausaufgaben (Feature f20) und Phasen-Materialien (Feature f19).
+"""
+from datetime import datetime
+from typing import Optional, List
+
+from sqlalchemy.orm import Session as DBSession
+
+from .db_models import (
+ HomeworkDB, HomeworkStatusEnum, PhaseMaterialDB, MaterialTypeEnum,
+)
+from .models import (
+ Homework, HomeworkStatus, PhaseMaterial, MaterialType,
+)
+
+
class HomeworkRepository:
    """Repository for homework tracking (feature f20)."""

    def __init__(self, db: DBSession):
        self.db = db

    # -- create -------------------------------------------------------

    def create(self, homework: Homework) -> HomeworkDB:
        """Persist a new homework entry and return the stored row."""
        row = HomeworkDB(
            id=homework.homework_id,
            teacher_id=homework.teacher_id,
            class_id=homework.class_id,
            subject=homework.subject,
            title=homework.title,
            description=homework.description,
            session_id=homework.session_id,
            due_date=homework.due_date,
            status=HomeworkStatusEnum(homework.status.value),
        )
        self.db.add(row)
        self.db.commit()
        self.db.refresh(row)
        return row

    # -- read ---------------------------------------------------------

    def get_by_id(self, homework_id: str) -> Optional[HomeworkDB]:
        """Return the homework entry with the given ID, or None."""
        return (
            self.db.query(HomeworkDB)
            .filter(HomeworkDB.id == homework_id)
            .first()
        )

    def get_by_teacher(
        self,
        teacher_id: str,
        status: Optional[str] = None,
        limit: int = 50
    ) -> List[HomeworkDB]:
        """Return a teacher's homework, optionally filtered by status.

        Ordered by due date ascending (undated entries last), then by
        creation time, newest first.
        """
        query = self.db.query(HomeworkDB).filter(
            HomeworkDB.teacher_id == teacher_id
        )
        if status:
            query = query.filter(HomeworkDB.status == HomeworkStatusEnum(status))
        ordered = query.order_by(
            HomeworkDB.due_date.asc().nullslast(),
            HomeworkDB.created_at.desc(),
        )
        return ordered.limit(limit).all()

    def get_by_class(
        self,
        class_id: str,
        teacher_id: str,
        include_completed: bool = False,
        limit: int = 20
    ) -> List[HomeworkDB]:
        """Return a class's homework, hiding completed entries by default."""
        query = self.db.query(HomeworkDB).filter(
            HomeworkDB.class_id == class_id,
            HomeworkDB.teacher_id == teacher_id,
        )
        if not include_completed:
            query = query.filter(HomeworkDB.status != HomeworkStatusEnum.COMPLETED)
        ordered = query.order_by(
            HomeworkDB.due_date.asc().nullslast(),
            HomeworkDB.created_at.desc(),
        )
        return ordered.limit(limit).all()

    def get_by_session(self, session_id: str) -> List[HomeworkDB]:
        """Return all homework attached to a session, newest first."""
        return (
            self.db.query(HomeworkDB)
            .filter(HomeworkDB.session_id == session_id)
            .order_by(HomeworkDB.created_at.desc())
            .all()
        )

    def get_pending(
        self,
        teacher_id: str,
        days_ahead: int = 7
    ) -> List[HomeworkDB]:
        """Return open homework due within the next days_ahead days."""
        from datetime import timedelta

        cutoff = datetime.utcnow() + timedelta(days=days_ahead)
        open_states = [HomeworkStatusEnum.ASSIGNED, HomeworkStatusEnum.IN_PROGRESS]
        return (
            self.db.query(HomeworkDB)
            .filter(
                HomeworkDB.teacher_id == teacher_id,
                HomeworkDB.status.in_(open_states),
                HomeworkDB.due_date <= cutoff,
            )
            .order_by(HomeworkDB.due_date.asc())
            .all()
        )

    # -- update -------------------------------------------------------

    def update_status(
        self,
        homework_id: str,
        status: HomeworkStatus
    ) -> Optional[HomeworkDB]:
        """Set the status of a homework entry; None if it does not exist."""
        row = self.get_by_id(homework_id)
        if row is None:
            return None
        row.status = HomeworkStatusEnum(status.value)
        self.db.commit()
        self.db.refresh(row)
        return row

    def update(self, homework: Homework) -> Optional[HomeworkDB]:
        """Overwrite title, description, due date and status of an entry."""
        row = self.get_by_id(homework.homework_id)
        if row is None:
            return None
        row.title = homework.title
        row.description = homework.description
        row.due_date = homework.due_date
        row.status = HomeworkStatusEnum(homework.status.value)
        self.db.commit()
        self.db.refresh(row)
        return row

    # -- delete -------------------------------------------------------

    def delete(self, homework_id: str) -> bool:
        """Delete a homework entry; True if a row was removed."""
        row = self.get_by_id(homework_id)
        if row is None:
            return False
        self.db.delete(row)
        self.db.commit()
        return True

    # -- conversion ---------------------------------------------------

    def to_dataclass(self, db_homework: HomeworkDB) -> Homework:
        """Convert a DB row into the Homework dataclass."""
        return Homework(
            homework_id=db_homework.id,
            teacher_id=db_homework.teacher_id,
            class_id=db_homework.class_id,
            subject=db_homework.subject,
            title=db_homework.title,
            description=db_homework.description or "",
            session_id=db_homework.session_id,
            due_date=db_homework.due_date,
            status=HomeworkStatus(db_homework.status.value),
            created_at=db_homework.created_at,
            updated_at=db_homework.updated_at,
        )
+
+
class MaterialRepository:
    """Repository for phase materials (feature f19)."""

    def __init__(self, db: DBSession):
        # Active SQLAlchemy session; all operations commit through it.
        self.db = db

    # ==================== CREATE ====================

    def create(self, material: PhaseMaterial) -> PhaseMaterialDB:
        """Persist a new material and return the stored row."""
        db_material = PhaseMaterialDB(
            id=material.material_id,
            teacher_id=material.teacher_id,
            title=material.title,
            material_type=MaterialTypeEnum(material.material_type.value),
            url=material.url,
            description=material.description,
            phase=material.phase,
            subject=material.subject,
            grade_level=material.grade_level,
            tags=material.tags,
            is_public=material.is_public,
            usage_count=material.usage_count,
            session_id=material.session_id,
        )
        self.db.add(db_material)
        self.db.commit()
        self.db.refresh(db_material)
        return db_material

    # ==================== READ ====================

    def get_by_id(self, material_id: str) -> Optional[PhaseMaterialDB]:
        """Return the material with the given ID, or None."""
        return self.db.query(PhaseMaterialDB).filter(
            PhaseMaterialDB.id == material_id
        ).first()

    def get_by_teacher(
        self,
        teacher_id: str,
        phase: Optional[str] = None,
        subject: Optional[str] = None,
        limit: int = 50
    ) -> List[PhaseMaterialDB]:
        """Return a teacher's materials, most-used first.

        Args:
            teacher_id: Owner of the materials.
            phase: Optional phase filter.
            subject: Optional subject filter.
            limit: Maximum number of rows returned.
        """
        query = self.db.query(PhaseMaterialDB).filter(
            PhaseMaterialDB.teacher_id == teacher_id
        )
        if phase:
            query = query.filter(PhaseMaterialDB.phase == phase)
        if subject:
            query = query.filter(PhaseMaterialDB.subject == subject)

        return query.order_by(
            PhaseMaterialDB.usage_count.desc(),
            PhaseMaterialDB.created_at.desc()
        ).limit(limit).all()

    def get_by_phase(
        self,
        phase: str,
        teacher_id: str,
        include_public: bool = True
    ) -> List[PhaseMaterialDB]:
        """Return all materials for a lesson phase.

        With include_public, other teachers' public materials are
        included and the result is ordered by usage count; otherwise
        only the teacher's own materials, newest first.
        """
        if include_public:
            return self.db.query(PhaseMaterialDB).filter(
                PhaseMaterialDB.phase == phase,
                (PhaseMaterialDB.teacher_id == teacher_id) |
                (PhaseMaterialDB.is_public == True)  # noqa: E712 (SQLAlchemy column comparison)
            ).order_by(
                PhaseMaterialDB.usage_count.desc()
            ).all()
        else:
            return self.db.query(PhaseMaterialDB).filter(
                PhaseMaterialDB.phase == phase,
                PhaseMaterialDB.teacher_id == teacher_id
            ).order_by(
                PhaseMaterialDB.created_at.desc()
            ).all()

    def get_by_session(self, session_id: str) -> List[PhaseMaterialDB]:
        """Return all materials attached to a session, grouped by phase."""
        return self.db.query(PhaseMaterialDB).filter(
            PhaseMaterialDB.session_id == session_id
        ).order_by(PhaseMaterialDB.phase, PhaseMaterialDB.created_at).all()

    def get_public_materials(
        self,
        phase: Optional[str] = None,
        subject: Optional[str] = None,
        limit: int = 20
    ) -> List[PhaseMaterialDB]:
        """Return public materials, most-used first, with optional filters."""
        query = self.db.query(PhaseMaterialDB).filter(
            PhaseMaterialDB.is_public == True  # noqa: E712 (SQLAlchemy column comparison)
        )
        if phase:
            query = query.filter(PhaseMaterialDB.phase == phase)
        if subject:
            query = query.filter(PhaseMaterialDB.subject == subject)

        return query.order_by(
            PhaseMaterialDB.usage_count.desc()
        ).limit(limit).all()

    def search_by_tags(
        self,
        tags: List[str],
        teacher_id: Optional[str] = None,
        limit: int = 50
    ) -> List[PhaseMaterialDB]:
        """Search materials matching any of the given tags.

        Visible scope: the teacher's own materials plus public ones when
        teacher_id is given, otherwise only public materials.

        Args:
            tags: Tags to match (a material matches if it carries at
                least one of them).
            teacher_id: Optional teacher whose private materials are
                included.
            limit: Maximum number of results (default 50, matching the
                previous hard-coded cap).

        NOTE(review): tag matching happens in Python over all visible
        rows; consider a JSON containment query in SQL if the table
        grows large.
        """
        query = self.db.query(PhaseMaterialDB)
        if teacher_id:
            query = query.filter(
                (PhaseMaterialDB.teacher_id == teacher_id) |
                (PhaseMaterialDB.is_public == True)  # noqa: E712 (SQLAlchemy column comparison)
            )
        else:
            query = query.filter(PhaseMaterialDB.is_public == True)  # noqa: E712

        wanted = set(tags)
        results: List[PhaseMaterialDB] = []
        for material in query.all():
            # Set intersection instead of a nested any() loop.
            if material.tags and wanted.intersection(material.tags):
                results.append(material)
                if len(results) >= limit:
                    break  # stop early once the cap is reached
        return results

    # ==================== UPDATE ====================

    def update(self, material: PhaseMaterial) -> Optional[PhaseMaterialDB]:
        """Overwrite the editable fields of a material; None if missing."""
        db_material = self.get_by_id(material.material_id)
        if not db_material:
            return None

        db_material.title = material.title
        db_material.material_type = MaterialTypeEnum(material.material_type.value)
        db_material.url = material.url
        db_material.description = material.description
        db_material.phase = material.phase
        db_material.subject = material.subject
        db_material.grade_level = material.grade_level
        db_material.tags = material.tags
        db_material.is_public = material.is_public

        self.db.commit()
        self.db.refresh(db_material)
        return db_material

    def increment_usage(self, material_id: str) -> Optional[PhaseMaterialDB]:
        """Increase a material's usage counter by one."""
        db_material = self.get_by_id(material_id)
        if not db_material:
            return None

        db_material.usage_count += 1
        self.db.commit()
        self.db.refresh(db_material)
        return db_material

    def attach_to_session(
        self,
        material_id: str,
        session_id: str
    ) -> Optional[PhaseMaterialDB]:
        """Attach a material to a session (also counts as one usage)."""
        db_material = self.get_by_id(material_id)
        if not db_material:
            return None

        db_material.session_id = session_id
        db_material.usage_count += 1
        self.db.commit()
        self.db.refresh(db_material)
        return db_material

    # ==================== DELETE ====================

    def delete(self, material_id: str) -> bool:
        """Delete a material; True if a row was removed."""
        db_material = self.get_by_id(material_id)
        if not db_material:
            return False

        self.db.delete(db_material)
        self.db.commit()
        return True

    # ==================== CONVERSION ====================

    def to_dataclass(self, db_material: PhaseMaterialDB) -> PhaseMaterial:
        """Convert a DB row into the PhaseMaterial dataclass."""
        return PhaseMaterial(
            material_id=db_material.id,
            teacher_id=db_material.teacher_id,
            title=db_material.title,
            material_type=MaterialType(db_material.material_type.value),
            url=db_material.url,
            description=db_material.description or "",
            phase=db_material.phase,
            subject=db_material.subject or "",
            grade_level=db_material.grade_level or "",
            tags=db_material.tags or [],
            is_public=db_material.is_public,
            usage_count=db_material.usage_count,
            session_id=db_material.session_id,
            created_at=db_material.created_at,
            updated_at=db_material.updated_at,
        )
diff --git a/backend-lehrer/classroom_engine/repository_reflection.py b/backend-lehrer/classroom_engine/repository_reflection.py
new file mode 100644
index 0000000..159fb5f
--- /dev/null
+++ b/backend-lehrer/classroom_engine/repository_reflection.py
@@ -0,0 +1,315 @@
+"""
+Reflection & Analytics Repositories.
+
+CRUD-Operationen fuer Lesson-Reflections und Analytics-Abfragen (Phase 5).
+"""
+from datetime import datetime
+from typing import Optional, List, Dict, Any
+
+from sqlalchemy.orm import Session as DBSession
+
+from .db_models import LessonSessionDB, LessonPhaseEnum, LessonReflectionDB
+from .analytics import (
+ LessonReflection, SessionSummary, TeacherAnalytics, AnalyticsCalculator,
+)
+
+
class ReflectionRepository:
    """Repository handling CRUD for LessonReflection records."""

    # Fields that update() copies 1:1 from the dataclass onto the row.
    _MUTABLE_FIELDS = (
        "notes",
        "overall_rating",
        "what_worked",
        "improvements",
        "notes_for_next_lesson",
    )

    def __init__(self, db: DBSession):
        self.db = db

    # -- create -------------------------------------------------------

    def create(self, reflection: LessonReflection) -> LessonReflectionDB:
        """Persist a new reflection and return the stored row."""
        row = LessonReflectionDB(
            id=reflection.reflection_id,
            session_id=reflection.session_id,
            teacher_id=reflection.teacher_id,
            notes=reflection.notes,
            overall_rating=reflection.overall_rating,
            what_worked=reflection.what_worked,
            improvements=reflection.improvements,
            notes_for_next_lesson=reflection.notes_for_next_lesson,
        )
        self.db.add(row)
        self.db.commit()
        self.db.refresh(row)
        return row

    # -- read ---------------------------------------------------------

    def get_by_id(self, reflection_id: str) -> Optional[LessonReflectionDB]:
        """Return the reflection with the given ID, or None."""
        return (
            self.db.query(LessonReflectionDB)
            .filter(LessonReflectionDB.id == reflection_id)
            .first()
        )

    def get_by_session(self, session_id: str) -> Optional[LessonReflectionDB]:
        """Return the reflection attached to a session, or None."""
        return (
            self.db.query(LessonReflectionDB)
            .filter(LessonReflectionDB.session_id == session_id)
            .first()
        )

    def get_by_teacher(
        self,
        teacher_id: str,
        limit: int = 20,
        offset: int = 0
    ) -> List[LessonReflectionDB]:
        """Return a teacher's reflections, newest first, paginated."""
        return (
            self.db.query(LessonReflectionDB)
            .filter(LessonReflectionDB.teacher_id == teacher_id)
            .order_by(LessonReflectionDB.created_at.desc())
            .offset(offset)
            .limit(limit)
            .all()
        )

    # -- update -------------------------------------------------------

    def update(self, reflection: LessonReflection) -> Optional[LessonReflectionDB]:
        """Overwrite the mutable fields of an existing reflection."""
        row = self.get_by_id(reflection.reflection_id)
        if row is None:
            return None

        for field in self._MUTABLE_FIELDS:
            setattr(row, field, getattr(reflection, field))

        self.db.commit()
        self.db.refresh(row)
        return row

    # -- delete -------------------------------------------------------

    def delete(self, reflection_id: str) -> bool:
        """Delete a reflection; True if a row was removed."""
        row = self.get_by_id(reflection_id)
        if row is None:
            return False
        self.db.delete(row)
        self.db.commit()
        return True

    # -- conversion ---------------------------------------------------

    def to_dataclass(self, db_reflection: LessonReflectionDB) -> LessonReflection:
        """Convert a DB row into the LessonReflection dataclass."""
        return LessonReflection(
            reflection_id=db_reflection.id,
            session_id=db_reflection.session_id,
            teacher_id=db_reflection.teacher_id,
            notes=db_reflection.notes or "",
            overall_rating=db_reflection.overall_rating,
            what_worked=db_reflection.what_worked or [],
            improvements=db_reflection.improvements or [],
            notes_for_next_lesson=db_reflection.notes_for_next_lesson or "",
            created_at=db_reflection.created_at,
            updated_at=db_reflection.updated_at,
        )
+
+
class AnalyticsRepository:
    """Repository for analytics queries over ended lesson sessions."""

    def __init__(self, db: DBSession):
        self.db = db

    # -- internal helpers ---------------------------------------------

    def _base_session_dict(self, row: LessonSessionDB) -> Dict[str, Any]:
        """Common session fields consumed by AnalyticsCalculator."""
        return {
            "session_id": row.id,
            "teacher_id": row.teacher_id,
            "class_id": row.class_id,
            "subject": row.subject,
            "topic": row.topic,
            "lesson_started_at": row.lesson_started_at,
            "lesson_ended_at": row.lesson_ended_at,
            "phase_durations": row.phase_durations or {},
        }

    def _ended_sessions(self, teacher_id: str, limit: int) -> List[LessonSessionDB]:
        """Most recently ended sessions of a teacher, newest first."""
        return (
            self.db.query(LessonSessionDB)
            .filter(
                LessonSessionDB.teacher_id == teacher_id,
                LessonSessionDB.current_phase == LessonPhaseEnum.ENDED,
            )
            .order_by(LessonSessionDB.lesson_ended_at.desc())
            .limit(limit)
            .all()
        )

    # -- public API ---------------------------------------------------

    def get_session_summary(self, session_id: str) -> Optional[SessionSummary]:
        """
        Compute the summary of a finished session.

        Args:
            session_id: ID of the session.

        Returns:
            SessionSummary, or None when the session does not exist.
        """
        row = (
            self.db.query(LessonSessionDB)
            .filter(LessonSessionDB.id == session_id)
            .first()
        )
        if row is None:
            return None

        return AnalyticsCalculator.calculate_session_summary(
            self._base_session_dict(row),
            row.phase_history or [],
        )

    def get_teacher_analytics(
        self,
        teacher_id: str,
        period_start: Optional[datetime] = None,
        period_end: Optional[datetime] = None
    ) -> TeacherAnalytics:
        """
        Compute aggregated statistics for a teacher.

        Args:
            teacher_id: ID of the teacher.
            period_start: Start of the period (default: 30 days back).
            period_end: End of the period (default: now).

        Returns:
            TeacherAnalytics with aggregated statistics.
        """
        from datetime import timedelta

        period_end = period_end or datetime.utcnow()
        period_start = period_start or (period_end - timedelta(days=30))

        rows = (
            self.db.query(LessonSessionDB)
            .filter(
                LessonSessionDB.teacher_id == teacher_id,
                LessonSessionDB.lesson_started_at >= period_start,
                LessonSessionDB.lesson_started_at <= period_end,
            )
            .all()
        )

        sessions_data = [
            {**self._base_session_dict(row), "phase_history": row.phase_history or []}
            for row in rows
        ]

        return AnalyticsCalculator.calculate_teacher_analytics(
            sessions_data, period_start, period_end
        )

    def get_phase_duration_trends(
        self,
        teacher_id: str,
        phase: str,
        limit: int = 20
    ) -> List[Dict[str, Any]]:
        """
        Return duration trends for one lesson phase.

        Args:
            teacher_id: ID of the teacher.
            phase: Phase ID (einstieg, erarbeitung, ...).
            limit: Maximum number of data points.

        Returns:
            Chronological list of points
            [{date, session_id, subject, planned_seconds,
              actual_seconds, difference_seconds}].
        """
        points: List[Dict[str, Any]] = []
        for row in self._ended_sessions(teacher_id, limit):
            # First history entry for the requested phase, if any.
            entry = next(
                (e for e in (row.phase_history or []) if e.get("phase") == phase),
                None,
            )
            if entry is None:
                continue
            planned = (row.phase_durations or {}).get(phase, 0) * 60
            actual = entry.get("duration_seconds", 0) or 0
            points.append({
                "date": row.lesson_started_at.isoformat() if row.lesson_started_at else None,
                "session_id": row.id,
                "subject": row.subject,
                "planned_seconds": planned,
                "actual_seconds": actual,
                "difference_seconds": actual - planned,
            })

        points.reverse()  # oldest first
        return points

    def get_overtime_analysis(
        self,
        teacher_id: str,
        limit: int = 30
    ) -> Dict[str, Any]:
        """
        Analyze overtime patterns per phase.

        Args:
            teacher_id: ID of the teacher.
            limit: Number of sessions to analyze.

        Returns:
            Dict with overtime statistics per phase.
        """
        phases = ("einstieg", "erarbeitung", "sicherung", "transfer", "reflexion")
        samples: Dict[str, List[int]] = {name: [] for name in phases}

        for row in self._ended_sessions(teacher_id, limit):
            planned_map = row.phase_durations or {}
            for entry in (row.phase_history or []):
                name = entry.get("phase", "")
                if name not in samples:
                    continue
                planned = planned_map.get(name, 0) * 60
                actual = entry.get("duration_seconds", 0) or 0
                samples[name].append(max(0, actual - planned))

        report: Dict[str, Any] = {}
        for name, values in samples.items():
            if not values:
                report[name] = {
                    "count": 0,
                    "total": 0,
                    "avg_overtime_seconds": 0,
                    "max_overtime_seconds": 0,
                    "overtime_percentage": 0,
                }
                continue
            late = sum(1 for v in values if v > 0)
            report[name] = {
                "count": late,
                "total": len(values),
                "avg_overtime_seconds": sum(values) / len(values),
                "max_overtime_seconds": max(values),
                "overtime_percentage": late / len(values) * 100,
            }

        return report
diff --git a/backend-lehrer/classroom_engine/repository_session.py b/backend-lehrer/classroom_engine/repository_session.py
new file mode 100644
index 0000000..1165d33
--- /dev/null
+++ b/backend-lehrer/classroom_engine/repository_session.py
@@ -0,0 +1,248 @@
+"""
+Session & Teacher Settings Repositories.
+
+CRUD-Operationen fuer LessonSessions und Lehrer-Einstellungen.
+"""
+from typing import Optional, List, Dict
+
+from sqlalchemy.orm import Session as DBSession
+
+from .db_models import (
+ LessonSessionDB, LessonPhaseEnum, TeacherSettingsDB,
+)
+from .models import (
+ LessonSession, LessonPhase, get_default_durations,
+)
+
+
class SessionRepository:
    """Repository handling CRUD for LessonSession records."""

    def __init__(self, db: DBSession):
        self.db = db

    # -- create -------------------------------------------------------

    def create(self, session: LessonSession) -> LessonSessionDB:
        """
        Persist a new session.

        Args:
            session: LessonSession dataclass to store.

        Returns:
            The stored LessonSessionDB row.
        """
        row = LessonSessionDB(
            id=session.session_id,
            teacher_id=session.teacher_id,
            class_id=session.class_id,
            subject=session.subject,
            topic=session.topic,
            current_phase=LessonPhaseEnum(session.current_phase.value),
            is_paused=session.is_paused,
            lesson_started_at=session.lesson_started_at,
            lesson_ended_at=session.lesson_ended_at,
            phase_started_at=session.phase_started_at,
            pause_started_at=session.pause_started_at,
            total_paused_seconds=session.total_paused_seconds,
            phase_durations=session.phase_durations,
            phase_history=session.phase_history,
            notes=session.notes,
            homework=session.homework,
        )
        self.db.add(row)
        self.db.commit()
        self.db.refresh(row)
        return row

    # -- read ---------------------------------------------------------

    def get_by_id(self, session_id: str) -> Optional[LessonSessionDB]:
        """Return the session with the given ID, or None."""
        return (
            self.db.query(LessonSessionDB)
            .filter(LessonSessionDB.id == session_id)
            .first()
        )

    def get_active_by_teacher(self, teacher_id: str) -> List[LessonSessionDB]:
        """Return every not-yet-ended session of a teacher."""
        return (
            self.db.query(LessonSessionDB)
            .filter(
                LessonSessionDB.teacher_id == teacher_id,
                LessonSessionDB.current_phase != LessonPhaseEnum.ENDED,
            )
            .all()
        )

    def get_history_by_teacher(
        self,
        teacher_id: str,
        limit: int = 20,
        offset: int = 0
    ) -> List[LessonSessionDB]:
        """Return a teacher's ended sessions, newest first (feature f17)."""
        return (
            self.db.query(LessonSessionDB)
            .filter(
                LessonSessionDB.teacher_id == teacher_id,
                LessonSessionDB.current_phase == LessonPhaseEnum.ENDED,
            )
            .order_by(LessonSessionDB.lesson_ended_at.desc())
            .offset(offset)
            .limit(limit)
            .all()
        )

    def get_by_class(
        self,
        class_id: str,
        limit: int = 20
    ) -> List[LessonSessionDB]:
        """Return the most recent sessions of a class."""
        return (
            self.db.query(LessonSessionDB)
            .filter(LessonSessionDB.class_id == class_id)
            .order_by(LessonSessionDB.created_at.desc())
            .limit(limit)
            .all()
        )

    # -- update -------------------------------------------------------

    def update(self, session: LessonSession) -> Optional[LessonSessionDB]:
        """
        Write the mutable state of a session back to the database.

        Args:
            session: Dataclass carrying the updated values.

        Returns:
            The refreshed row, or None when the session does not exist.
        """
        row = self.get_by_id(session.session_id)
        if row is None:
            return None

        row.current_phase = LessonPhaseEnum(session.current_phase.value)
        # Remaining fields map 1:1 between dataclass and DB row.
        for field in (
            "is_paused",
            "lesson_started_at",
            "lesson_ended_at",
            "phase_started_at",
            "pause_started_at",
            "total_paused_seconds",
            "phase_durations",
            "phase_history",
            "notes",
            "homework",
        ):
            setattr(row, field, getattr(session, field))

        self.db.commit()
        self.db.refresh(row)
        return row

    def update_notes(
        self,
        session_id: str,
        notes: str,
        homework: str
    ) -> Optional[LessonSessionDB]:
        """Update only the notes and homework fields of a session."""
        row = self.get_by_id(session_id)
        if row is None:
            return None

        row.notes = notes
        row.homework = homework
        self.db.commit()
        self.db.refresh(row)
        return row

    # -- delete -------------------------------------------------------

    def delete(self, session_id: str) -> bool:
        """Delete a session; True if a row was removed."""
        row = self.get_by_id(session_id)
        if row is None:
            return False
        self.db.delete(row)
        self.db.commit()
        return True

    # -- conversion ---------------------------------------------------

    def to_dataclass(self, db_session: LessonSessionDB) -> LessonSession:
        """
        Convert a DB row into the LessonSession dataclass.

        Nullable columns fall back to safe defaults (0, empty
        containers, default phase durations).
        """
        return LessonSession(
            session_id=db_session.id,
            teacher_id=db_session.teacher_id,
            class_id=db_session.class_id,
            subject=db_session.subject,
            topic=db_session.topic,
            current_phase=LessonPhase(db_session.current_phase.value),
            phase_started_at=db_session.phase_started_at,
            lesson_started_at=db_session.lesson_started_at,
            lesson_ended_at=db_session.lesson_ended_at,
            is_paused=db_session.is_paused,
            pause_started_at=db_session.pause_started_at,
            total_paused_seconds=db_session.total_paused_seconds or 0,
            phase_durations=db_session.phase_durations or get_default_durations(),
            phase_history=db_session.phase_history or [],
            notes=db_session.notes or "",
            homework=db_session.homework or "",
        )
+
+
class TeacherSettingsRepository:
    """Repository for per-teacher settings (feature f16)."""

    def __init__(self, db: DBSession):
        self.db = db

    def get_or_create(self, teacher_id: str) -> TeacherSettingsDB:
        """Fetch a teacher's settings row, creating defaults on first access."""
        existing = (
            self.db.query(TeacherSettingsDB)
            .filter(TeacherSettingsDB.teacher_id == teacher_id)
            .first()
        )
        if existing is not None:
            return existing

        # NOTE(review): not race-safe under concurrent first access --
        # two sessions could both insert; presumably acceptable here,
        # confirm against deployment model.
        created = TeacherSettingsDB(
            teacher_id=teacher_id,
            default_phase_durations=get_default_durations(),
        )
        self.db.add(created)
        self.db.commit()
        self.db.refresh(created)
        return created

    def update_phase_durations(
        self,
        teacher_id: str,
        durations: Dict[str, int]
    ) -> TeacherSettingsDB:
        """Replace the teacher's default phase durations."""
        settings = self.get_or_create(teacher_id)
        settings.default_phase_durations = durations
        self.db.commit()
        self.db.refresh(settings)
        return settings

    def update_preferences(
        self,
        teacher_id: str,
        audio_enabled: Optional[bool] = None,
        high_contrast: Optional[bool] = None,
        show_statistics: Optional[bool] = None
    ) -> TeacherSettingsDB:
        """Update UI preferences; only non-None flags are changed."""
        settings = self.get_or_create(teacher_id)

        updates = {
            "audio_enabled": audio_enabled,
            "high_contrast": high_contrast,
            "show_statistics": show_statistics,
        }
        for attr, value in updates.items():
            if value is not None:
                setattr(settings, attr, value)

        self.db.commit()
        self.db.refresh(settings)
        return settings
diff --git a/backend-lehrer/classroom_engine/repository_template.py b/backend-lehrer/classroom_engine/repository_template.py
new file mode 100644
index 0000000..e97c16d
--- /dev/null
+++ b/backend-lehrer/classroom_engine/repository_template.py
@@ -0,0 +1,167 @@
+"""
+Template Repository.
+
+CRUD-Operationen fuer Stunden-Vorlagen (Feature f37).
+"""
+from typing import Optional, List
+
+from sqlalchemy.orm import Session as DBSession
+
+from .db_models import LessonTemplateDB
+from .models import LessonTemplate, get_default_durations
+
+
+class TemplateRepository:
+ """Repository fuer Stunden-Vorlagen (Feature f37)."""
+
+ def __init__(self, db: DBSession):
+ self.db = db
+
+ # ==================== CREATE ====================
+
+ def create(self, template: LessonTemplate) -> LessonTemplateDB:
+ """Erstellt eine neue Vorlage."""
+ db_template = LessonTemplateDB(
+ id=template.template_id,
+ teacher_id=template.teacher_id,
+ name=template.name,
+ description=template.description,
+ subject=template.subject,
+ grade_level=template.grade_level,
+ phase_durations=template.phase_durations,
+ default_topic=template.default_topic,
+ default_notes=template.default_notes,
+ is_public=template.is_public,
+ usage_count=template.usage_count,
+ )
+ self.db.add(db_template)
+ self.db.commit()
+ self.db.refresh(db_template)
+ return db_template
+
+ # ==================== READ ====================
+
+ def get_by_id(self, template_id: str) -> Optional[LessonTemplateDB]:
+ """Holt eine Vorlage nach ID."""
+ return self.db.query(LessonTemplateDB).filter(
+ LessonTemplateDB.id == template_id
+ ).first()
+
+ def get_by_teacher(
+ self,
+ teacher_id: str,
+ include_public: bool = True
+ ) -> List[LessonTemplateDB]:
+ """
+ Holt alle Vorlagen eines Lehrers.
+
+ Args:
+ teacher_id: ID des Lehrers
+ include_public: Auch oeffentliche Vorlagen anderer Lehrer einbeziehen
+ """
+ if include_public:
+ return self.db.query(LessonTemplateDB).filter(
+ (LessonTemplateDB.teacher_id == teacher_id) |
+ (LessonTemplateDB.is_public == True)
+ ).order_by(
+ LessonTemplateDB.usage_count.desc()
+ ).all()
+ else:
+ return self.db.query(LessonTemplateDB).filter(
+ LessonTemplateDB.teacher_id == teacher_id
+ ).order_by(
+ LessonTemplateDB.created_at.desc()
+ ).all()
+
+ def get_public_templates(self, limit: int = 20) -> List[LessonTemplateDB]:
+ """Holt oeffentliche Vorlagen, sortiert nach Beliebtheit."""
+ return self.db.query(LessonTemplateDB).filter(
+ LessonTemplateDB.is_public == True
+ ).order_by(
+ LessonTemplateDB.usage_count.desc()
+ ).limit(limit).all()
+
+ def get_by_subject(
+ self,
+ subject: str,
+ teacher_id: Optional[str] = None
+ ) -> List[LessonTemplateDB]:
+ """Holt Vorlagen fuer ein bestimmtes Fach."""
+ query = self.db.query(LessonTemplateDB).filter(
+ LessonTemplateDB.subject == subject
+ )
+ if teacher_id:
+ query = query.filter(
+ (LessonTemplateDB.teacher_id == teacher_id) |
+ (LessonTemplateDB.is_public == True)
+ )
+ else:
+ query = query.filter(LessonTemplateDB.is_public == True)
+
+ return query.order_by(
+ LessonTemplateDB.usage_count.desc()
+ ).all()
+
+ # ==================== UPDATE ====================
+
+ def update(self, template: LessonTemplate) -> Optional[LessonTemplateDB]:
+ """Aktualisiert eine Vorlage."""
+ db_template = self.get_by_id(template.template_id)
+ if not db_template:
+ return None
+
+ db_template.name = template.name
+ db_template.description = template.description
+ db_template.subject = template.subject
+ db_template.grade_level = template.grade_level
+ db_template.phase_durations = template.phase_durations
+ db_template.default_topic = template.default_topic
+ db_template.default_notes = template.default_notes
+ db_template.is_public = template.is_public
+
+ self.db.commit()
+ self.db.refresh(db_template)
+ return db_template
+
+ def increment_usage(self, template_id: str) -> Optional[LessonTemplateDB]:
+ """Erhoeht den Usage-Counter einer Vorlage."""
+ db_template = self.get_by_id(template_id)
+ if not db_template:
+ return None
+
+ db_template.usage_count += 1
+ self.db.commit()
+ self.db.refresh(db_template)
+ return db_template
+
+ # ==================== DELETE ====================
+
+ def delete(self, template_id: str) -> bool:
+ """Loescht eine Vorlage."""
+ db_template = self.get_by_id(template_id)
+ if not db_template:
+ return False
+
+ self.db.delete(db_template)
+ self.db.commit()
+ return True
+
+ # ==================== CONVERSION ====================
+
+ def to_dataclass(self, db_template: LessonTemplateDB) -> LessonTemplate:
+ """Konvertiert DB-Model zu Dataclass."""
+ return LessonTemplate(
+ template_id=db_template.id,
+ teacher_id=db_template.teacher_id,
+ name=db_template.name,
+ description=db_template.description or "",
+ subject=db_template.subject or "",
+ grade_level=db_template.grade_level or "",
+ phase_durations=db_template.phase_durations or get_default_durations(),
+ default_topic=db_template.default_topic or "",
+ default_notes=db_template.default_notes or "",
+ is_public=db_template.is_public,
+ usage_count=db_template.usage_count,
+ created_at=db_template.created_at,
+ updated_at=db_template.updated_at,
+ )
diff --git a/klausur-service/backend/cv_cell_grid.py b/klausur-service/backend/cv_cell_grid.py
index 56fd472..466565e 100644
--- a/klausur-service/backend/cv_cell_grid.py
+++ b/klausur-service/backend/cv_cell_grid.py
@@ -1,1675 +1,60 @@
"""
Cell-grid construction (v2 + legacy), vocab conversion, and word-grid OCR.
+Re-export hub — all public and private names remain importable from here
+for backward compatibility. The actual implementations live in:
+
+ cv_cell_grid_helpers.py — shared helpers (_heal_row_gaps, _is_artifact_row, ...)
+ cv_cell_grid_build.py — v2 hybrid grid (build_cell_grid_v2, _ocr_cell_crop)
+ cv_cell_grid_legacy.py — deprecated v1 grid (build_cell_grid, _ocr_single_cell)
+ cv_cell_grid_streaming.py — streaming variants (build_cell_grid_v2_streaming, ...)
+ cv_cell_grid_merge.py — row-merging logic (_merge_wrapped_rows, ...)
+ cv_cell_grid_vocab.py — vocab extraction (_cells_to_vocab_entries, build_word_grid)
+
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
-import logging
-import re
-import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Dict, Generator, List, Optional, Tuple
-
-import numpy as np
-
-from cv_vocab_types import PageRegion, RowGeometry
-from cv_ocr_engines import (
- RAPIDOCR_AVAILABLE,
- _RE_ALPHA,
- _assign_row_words_to_columns,
- _attach_example_sentences,
- _clean_cell_text,
- _clean_cell_text_lite,
- _fix_phonetic_brackets,
- _split_comma_entries,
- _words_to_reading_order_text,
- _words_to_spaced_text,
- ocr_region_lighton,
- ocr_region_rapid,
- ocr_region_trocr,
+# --- Helpers ---
+from cv_cell_grid_helpers import ( # noqa: F401
+ _MIN_WORD_CONF,
+ _compute_cell_padding,
+ _ensure_minimum_crop_size,
+ _heal_row_gaps,
+ _is_artifact_row,
+ _select_psm_for_column,
)
-logger = logging.getLogger(__name__)
-
-try:
- import cv2
-except ImportError:
- cv2 = None # type: ignore[assignment]
-
-try:
- from PIL import Image
-except ImportError:
- Image = None # type: ignore[assignment,misc]
-
-# Minimum OCR word confidence to keep (used across multiple functions)
-_MIN_WORD_CONF = 30
-
-# ---------------------------------------------------------------------------
-
-def _ocr_cell_crop(
- row_idx: int,
- col_idx: int,
- row: RowGeometry,
- col: PageRegion,
- ocr_img: np.ndarray,
- img_bgr: Optional[np.ndarray],
- img_w: int,
- img_h: int,
- engine_name: str,
- lang: str,
- lang_map: Dict[str, str],
-) -> Dict[str, Any]:
- """OCR a single cell by cropping the exact column×row intersection.
-
- No padding beyond cell boundaries → no neighbour bleeding.
- """
- # Display bbox: exact column × row intersection
- disp_x = col.x
- disp_y = row.y
- disp_w = col.width
- disp_h = row.height
-
- # Crop boundaries: add small internal padding (3px each side) to avoid
- # clipping characters near column/row edges (e.g. parentheses, descenders).
- # Stays within image bounds but may extend slightly beyond strict cell.
- # 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
- _PAD = 3
- cx = max(0, disp_x - _PAD)
- cy = max(0, disp_y - _PAD)
- cx2 = min(img_w, disp_x + disp_w + _PAD)
- cy2 = min(img_h, disp_y + disp_h + _PAD)
- cw = cx2 - cx
- ch = cy2 - cy
-
- empty_cell = {
- 'cell_id': f"R{row_idx:02d}_C{col_idx}",
- 'row_index': row_idx,
- 'col_index': col_idx,
- 'col_type': col.type,
- 'text': '',
- 'confidence': 0.0,
- 'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
- 'bbox_pct': {
- 'x': round(disp_x / img_w * 100, 2) if img_w else 0,
- 'y': round(disp_y / img_h * 100, 2) if img_h else 0,
- 'w': round(disp_w / img_w * 100, 2) if img_w else 0,
- 'h': round(disp_h / img_h * 100, 2) if img_h else 0,
- },
- 'ocr_engine': 'cell_crop_v2',
- 'is_bold': False,
- }
-
- if cw <= 0 or ch <= 0:
- logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
- return empty_cell
-
- # --- Pixel-density check: skip truly empty cells ---
- if ocr_img is not None:
- crop = ocr_img[cy:cy + ch, cx:cx + cw]
- if crop.size > 0:
- dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
- if dark_ratio < 0.005:
- logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
- row_idx, col_idx, dark_ratio, cw, ch)
- return empty_cell
-
- # --- Prepare crop for OCR ---
- cell_lang = lang_map.get(col.type, lang)
- psm = _select_psm_for_column(col.type, col.width, row.height)
- text = ''
- avg_conf = 0.0
- used_engine = 'cell_crop_v2'
-
- if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
- cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
- words = ocr_region_trocr(img_bgr, cell_region,
- handwritten=(engine_name == "trocr-handwritten"))
- elif engine_name == "lighton" and img_bgr is not None:
- cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
- words = ocr_region_lighton(img_bgr, cell_region)
- elif engine_name == "rapid" and img_bgr is not None:
- # Upscale small BGR crops for RapidOCR.
- # Cell crops typically have height 35-55px but width >300px.
- # _ensure_minimum_crop_size only scales when EITHER dim < min_dim,
- # using uniform scale → a 365×54 crop becomes ~1014×150 (scale ~2.78).
- # For very short heights (< 80px), force 3× upscale for better OCR
- # of small characters like periods, ellipsis, and phonetic symbols.
- bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
- if bgr_crop.size == 0:
- words = []
- else:
- crop_h, crop_w = bgr_crop.shape[:2]
- if crop_h < 80:
- # Force 3× upscale for short rows — small chars need more pixels
- scale = 3.0
- bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
- interpolation=cv2.INTER_CUBIC)
- else:
- bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
- up_h, up_w = bgr_up.shape[:2]
- scale_x = up_w / max(crop_w, 1)
- scale_y = up_h / max(crop_h, 1)
- was_scaled = (up_w != crop_w or up_h != crop_h)
- logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
- row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
- tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
- words = ocr_region_rapid(bgr_up, tmp_region)
- # Remap positions back to original image coords
- if words and was_scaled:
- for w in words:
- w['left'] = int(w['left'] / scale_x) + cx
- w['top'] = int(w['top'] / scale_y) + cy
- w['width'] = int(w['width'] / scale_x)
- w['height'] = int(w['height'] / scale_y)
- elif words:
- for w in words:
- w['left'] += cx
- w['top'] += cy
- else:
- # Tesseract: upscale tiny crops for better recognition
- if ocr_img is not None:
- crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
- upscaled = _ensure_minimum_crop_size(crop_slice)
- up_h, up_w = upscaled.shape[:2]
- tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
- words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm)
- # Remap word positions back to original image coordinates
- if words and (up_w != cw or up_h != ch):
- sx = cw / max(up_w, 1)
- sy = ch / max(up_h, 1)
- for w in words:
- w['left'] = int(w['left'] * sx) + cx
- w['top'] = int(w['top'] * sy) + cy
- w['width'] = int(w['width'] * sx)
- w['height'] = int(w['height'] * sy)
- elif words:
- for w in words:
- w['left'] += cx
- w['top'] += cy
- else:
- words = []
-
- # Filter low-confidence words
- if words:
- words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
-
- if words:
- y_tol = max(15, ch)
- text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
- avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
- logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
- row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
- else:
- logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
- row_idx, col_idx, cw, ch, psm, engine_name)
-
- # --- PSM 7 fallback for still-empty Tesseract cells ---
- if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
- crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
- upscaled = _ensure_minimum_crop_size(crop_slice)
- up_h, up_w = upscaled.shape[:2]
- tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
- psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7)
- if psm7_words:
- psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
- if psm7_words:
- p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
- if p7_text.strip():
- text = p7_text
- avg_conf = round(
- sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
- )
- used_engine = 'cell_crop_v2_psm7'
- # Remap PSM7 word positions back to original image coords
- if up_w != cw or up_h != ch:
- sx = cw / max(up_w, 1)
- sy = ch / max(up_h, 1)
- for w in psm7_words:
- w['left'] = int(w['left'] * sx) + cx
- w['top'] = int(w['top'] * sy) + cy
- w['width'] = int(w['width'] * sx)
- w['height'] = int(w['height'] * sy)
- else:
- for w in psm7_words:
- w['left'] += cx
- w['top'] += cy
- words = psm7_words
-
- # --- Noise filter ---
- if text.strip():
- pre_filter = text
- text = _clean_cell_text_lite(text)
- if not text:
- logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
- row_idx, col_idx, pre_filter)
- avg_conf = 0.0
-
- result = dict(empty_cell)
- result['text'] = text
- result['confidence'] = avg_conf
- result['ocr_engine'] = used_engine
-
- # Store individual word bounding boxes (absolute image coordinates)
- # for pixel-accurate overlay positioning in the frontend.
- if words and text.strip():
- result['word_boxes'] = [
- {
- 'text': w.get('text', ''),
- 'left': w['left'],
- 'top': w['top'],
- 'width': w['width'],
- 'height': w['height'],
- 'conf': w.get('conf', 0),
- }
- for w in words
- if w.get('text', '').strip()
- ]
-
- return result
-
-
-# Threshold: columns narrower than this (% of image width) use single-cell
-# crop OCR instead of full-page word assignment.
-#
-# Broad columns (>= threshold): Full-page Tesseract word assignment.
-# Better for multi-word content (sentences, IPA brackets, punctuation).
-# Examples: EN vocabulary, DE translation, example sentences.
-#
-# Narrow columns (< threshold): Isolated cell-crop OCR.
-# Prevents neighbour bleeding from adjacent broad columns.
-# Examples: page_ref, marker, numbering columns.
-#
-# 15% was empirically validated across vocab table scans with 3-5 columns.
-# Typical broad columns: 20-40% width. Typical narrow columns: 3-12% width.
-# The 15% boundary cleanly separates the two groups.
-_NARROW_COL_THRESHOLD_PCT = 15.0
-
-
-def build_cell_grid_v2(
- ocr_img: np.ndarray,
- column_regions: List[PageRegion],
- row_geometries: List[RowGeometry],
- img_w: int,
- img_h: int,
- lang: str = "eng+deu",
- ocr_engine: str = "auto",
- img_bgr: Optional[np.ndarray] = None,
- skip_heal_gaps: bool = False,
-) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
- """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.
-
- Drop-in replacement for build_cell_grid() — same signature & return type.
-
- Strategy:
- - Broad columns (>15% image width): Use pre-assigned full-page Tesseract
- words (from row.words). Handles IPA brackets, punctuation, sentence
- continuity correctly.
- - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
- neighbour bleeding from adjacent broad columns.
- """
- engine_name = "tesseract"
- if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
- engine_name = ocr_engine
- elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
- engine_name = "rapid"
-
- logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")
-
- # Filter to content rows only
- content_rows = [r for r in row_geometries if r.row_type == 'content']
- if not content_rows:
- logger.warning("build_cell_grid_v2: no content rows found")
- return [], []
-
- # Filter phantom rows (word_count=0) and artifact rows
- before = len(content_rows)
- content_rows = [r for r in content_rows if r.word_count > 0]
- skipped = before - len(content_rows)
- if skipped > 0:
- logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
- if not content_rows:
- logger.warning("build_cell_grid_v2: no content rows with words found")
- return [], []
-
- before_art = len(content_rows)
- content_rows = [r for r in content_rows if not _is_artifact_row(r)]
- artifact_skipped = before_art - len(content_rows)
- if artifact_skipped > 0:
- logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
- if not content_rows:
- logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
- return [], []
-
- # Filter columns
- _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
- 'margin_bottom', 'margin_left', 'margin_right'}
- relevant_cols = [c for c in column_regions if c.type not in _skip_types]
- if not relevant_cols:
- logger.warning("build_cell_grid_v2: no usable columns found")
- return [], []
-
- # Heal row gaps — use header/footer boundaries
- content_rows.sort(key=lambda r: r.y)
- header_rows = [r for r in row_geometries if r.row_type == 'header']
- footer_rows = [r for r in row_geometries if r.row_type == 'footer']
- if header_rows:
- top_bound = max(r.y + r.height for r in header_rows)
- else:
- top_bound = content_rows[0].y
- if footer_rows:
- bottom_bound = min(r.y for r in footer_rows)
- else:
- bottom_bound = content_rows[-1].y + content_rows[-1].height
-
- # skip_heal_gaps: When True, keep cell positions at their exact row geometry
- # positions without expanding to fill gaps from removed rows. Useful for
- # overlay rendering where pixel-precise positioning matters more than
- # full-coverage OCR crops.
- if not skip_heal_gaps:
- _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)
-
- relevant_cols.sort(key=lambda c: c.x)
-
- columns_meta = [
- {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
- for ci, c in enumerate(relevant_cols)
- ]
-
- lang_map = {
- 'column_en': 'eng',
- 'column_de': 'deu',
- 'column_example': 'eng+deu',
- }
-
- # --- Classify columns as broad vs narrow ---
- narrow_col_indices = set()
- for ci, col in enumerate(relevant_cols):
- col_pct = (col.width / img_w * 100) if img_w > 0 else 0
- if col_pct < _NARROW_COL_THRESHOLD_PCT:
- narrow_col_indices.add(ci)
-
- broad_col_count = len(relevant_cols) - len(narrow_col_indices)
- logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
- f"{len(narrow_col_indices)} narrow columns (cell-crop)")
-
- # --- Phase 1: Broad columns via full-page word assignment ---
- cells: List[Dict[str, Any]] = []
-
- for row_idx, row in enumerate(content_rows):
- # Assign full-page words to columns for this row
- col_words = _assign_row_words_to_columns(row, relevant_cols)
-
- for col_idx, col in enumerate(relevant_cols):
- if col_idx not in narrow_col_indices:
- # BROAD column: use pre-assigned full-page words
- words = col_words.get(col_idx, [])
- # Filter low-confidence words
- words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
-
- # Single full-width column (box sub-session): preserve spacing
- is_single_full_column = (
- len(relevant_cols) == 1
- and img_w > 0
- and relevant_cols[0].width / img_w > 0.9
- )
-
- if words:
- y_tol = max(15, row.height)
- if is_single_full_column:
- text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
- logger.info(f"R{row_idx:02d}: {len(words)} words, "
- f"text={text!r:.100}")
- else:
- text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
- avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
- else:
- text = ''
- avg_conf = 0.0
- if is_single_full_column:
- logger.info(f"R{row_idx:02d}: 0 words (row has "
- f"{row.word_count} total, y={row.y}..{row.y+row.height})")
-
- # Apply noise filter — but NOT for single-column sub-sessions:
- # 1. _clean_cell_text strips trailing non-alpha tokens (e.g. €0.50,
- # £1, €2.50) which are valid content in box layouts.
- # 2. _clean_cell_text joins tokens with single space, destroying
- # the proportional spacing from _words_to_spaced_text.
- if not is_single_full_column:
- text = _clean_cell_text(text)
-
- cell = {
- 'cell_id': f"R{row_idx:02d}_C{col_idx}",
- 'row_index': row_idx,
- 'col_index': col_idx,
- 'col_type': col.type,
- 'text': text,
- 'confidence': avg_conf,
- 'bbox_px': {
- 'x': col.x, 'y': row.y,
- 'w': col.width, 'h': row.height,
- },
- 'bbox_pct': {
- 'x': round(col.x / img_w * 100, 2) if img_w else 0,
- 'y': round(row.y / img_h * 100, 2) if img_h else 0,
- 'w': round(col.width / img_w * 100, 2) if img_w else 0,
- 'h': round(row.height / img_h * 100, 2) if img_h else 0,
- },
- 'ocr_engine': 'word_lookup',
- 'is_bold': False,
- }
- # Store word bounding boxes for pixel-accurate overlay
- if words and text.strip():
- cell['word_boxes'] = [
- {
- 'text': w.get('text', ''),
- 'left': w['left'],
- 'top': w['top'],
- 'width': w['width'],
- 'height': w['height'],
- 'conf': w.get('conf', 0),
- }
- for w in words
- if w.get('text', '').strip()
- ]
- cells.append(cell)
-
- # --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
- narrow_tasks = []
- for row_idx, row in enumerate(content_rows):
- for col_idx, col in enumerate(relevant_cols):
- if col_idx in narrow_col_indices:
- narrow_tasks.append((row_idx, col_idx, row, col))
-
- if narrow_tasks:
- max_workers = 4 if engine_name == "tesseract" else 2
- with ThreadPoolExecutor(max_workers=max_workers) as pool:
- futures = {
- pool.submit(
- _ocr_cell_crop,
- ri, ci, row, col,
- ocr_img, img_bgr, img_w, img_h,
- engine_name, lang, lang_map,
- ): (ri, ci)
- for ri, ci, row, col in narrow_tasks
- }
- for future in as_completed(futures):
- try:
- cell = future.result()
- cells.append(cell)
- except Exception as e:
- ri, ci = futures[future]
- logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")
-
- # Sort cells by (row_index, col_index)
- cells.sort(key=lambda c: (c['row_index'], c['col_index']))
-
- # Remove all-empty rows
- rows_with_text: set = set()
- for cell in cells:
- if cell['text'].strip():
- rows_with_text.add(cell['row_index'])
- before_filter = len(cells)
- cells = [c for c in cells if c['row_index'] in rows_with_text]
- empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
- if empty_rows_removed > 0:
- logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")
-
- # Bold detection disabled: cell-level stroke-width analysis cannot
- # distinguish bold from non-bold when cells contain mixed formatting
- # (e.g. "cookie ['kuki]" — bold word + non-bold phonetics).
- # TODO: word-level bold detection would require per-word bounding boxes.
-
- logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
- f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
- f"engine={engine_name} (hybrid)")
-
- return cells, columns_meta
-
-
-def build_cell_grid_v2_streaming(
- ocr_img: np.ndarray,
- column_regions: List[PageRegion],
- row_geometries: List[RowGeometry],
- img_w: int,
- img_h: int,
- lang: str = "eng+deu",
- ocr_engine: str = "auto",
- img_bgr: Optional[np.ndarray] = None,
-) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
- """Streaming variant of build_cell_grid_v2 — yields each cell as OCR'd.
-
- Yields:
- (cell_dict, columns_meta, total_cells)
- """
- # Resolve engine — default to Tesseract for cell-first OCR.
- # Tesseract excels at isolated text crops (binarized, upscaled).
- # RapidOCR is optimized for full-page scene-text and produces artifacts
- # on small cell crops (extra chars, missing punctuation, garbled IPA).
- use_rapid = False
- if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
- engine_name = ocr_engine
- elif ocr_engine == "auto":
- engine_name = "tesseract"
- elif ocr_engine == "rapid":
- if not RAPIDOCR_AVAILABLE:
- logger.warning("RapidOCR requested but not available, falling back to Tesseract")
- else:
- use_rapid = True
- engine_name = "rapid" if use_rapid else "tesseract"
- else:
- engine_name = "tesseract"
-
- content_rows = [r for r in row_geometries if r.row_type == 'content']
- if not content_rows:
- return
-
- content_rows = [r for r in content_rows if r.word_count > 0]
- if not content_rows:
- return
-
- _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
- 'margin_bottom', 'margin_left', 'margin_right'}
- relevant_cols = [c for c in column_regions if c.type not in _skip_types]
- if not relevant_cols:
- return
-
- content_rows = [r for r in content_rows if not _is_artifact_row(r)]
- if not content_rows:
- return
-
- # Use header/footer boundaries for heal_row_gaps (same as build_cell_grid_v2)
- content_rows.sort(key=lambda r: r.y)
- header_rows = [r for r in row_geometries if r.row_type == 'header']
- footer_rows = [r for r in row_geometries if r.row_type == 'footer']
- if header_rows:
- top_bound = max(r.y + r.height for r in header_rows)
- else:
- top_bound = content_rows[0].y
- if footer_rows:
- bottom_bound = min(r.y for r in footer_rows)
- else:
- bottom_bound = content_rows[-1].y + content_rows[-1].height
-
- _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)
-
- relevant_cols.sort(key=lambda c: c.x)
-
- columns_meta = [
- {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
- for ci, c in enumerate(relevant_cols)
- ]
-
- lang_map = {
- 'column_en': 'eng',
- 'column_de': 'deu',
- 'column_example': 'eng+deu',
- }
-
- total_cells = len(content_rows) * len(relevant_cols)
-
- for row_idx, row in enumerate(content_rows):
- for col_idx, col in enumerate(relevant_cols):
- cell = _ocr_cell_crop(
- row_idx, col_idx, row, col,
- ocr_img, img_bgr, img_w, img_h,
- engine_name, lang, lang_map,
- )
- yield cell, columns_meta, total_cells
-
-
-# ---------------------------------------------------------------------------
-# Narrow-column OCR helpers (Proposal B) — DEPRECATED (kept for legacy build_cell_grid)
-# ---------------------------------------------------------------------------
-
-def _compute_cell_padding(col_width: int, img_w: int) -> int:
- """Adaptive padding for OCR crops based on column width.
-
- Narrow columns (page_ref, marker) need more surrounding context so
- Tesseract can segment characters correctly. Wide columns keep the
- minimal 4 px padding to avoid pulling in neighbours.
- """
- col_pct = col_width / img_w * 100 if img_w > 0 else 100
- if col_pct < 5:
- return max(20, col_width // 2)
- if col_pct < 10:
- return max(12, col_width // 4)
- if col_pct < 15:
- return 8
- return 4
-
-
-def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
- max_scale: int = 3) -> np.ndarray:
- """Upscale tiny crops so Tesseract gets enough pixel data.
-
- If either dimension is below *min_dim*, the crop is bicubic-upscaled
- so the smallest dimension reaches *min_dim* (capped at *max_scale* ×).
- """
- h, w = crop.shape[:2]
- if h >= min_dim and w >= min_dim:
- return crop
- scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
- if scale <= 1.0:
- return crop
- new_w = int(w * scale)
- new_h = int(h * scale)
- return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
-
-
-def _select_psm_for_column(col_type: str, col_width: int,
- row_height: int) -> int:
- """Choose the best Tesseract PSM for a given column geometry.
-
- - page_ref columns are almost always single short tokens → PSM 8
- - Very narrow or short cells → PSM 7 (single text line)
- - Everything else → PSM 6 (uniform block)
- """
- if col_type in ('page_ref', 'marker'):
- return 8 # single word
- if col_width < 100 or row_height < 30:
- return 7 # single line
- return 6 # uniform block
-
-
-def _ocr_single_cell(
- row_idx: int,
- col_idx: int,
- row: RowGeometry,
- col: PageRegion,
- ocr_img: np.ndarray,
- img_bgr: Optional[np.ndarray],
- img_w: int,
- img_h: int,
- use_rapid: bool,
- engine_name: str,
- lang: str,
- lang_map: Dict[str, str],
- preassigned_words: Optional[List[Dict]] = None,
-) -> Dict[str, Any]:
- """Populate a single cell (column x row intersection) via word lookup."""
- # Display bbox: exact column × row intersection (no padding)
- disp_x = col.x
- disp_y = row.y
- disp_w = col.width
- disp_h = row.height
-
- # OCR crop: adaptive padding — narrow columns get more context
- pad = _compute_cell_padding(col.width, img_w)
- cell_x = max(0, col.x - pad)
- cell_y = max(0, row.y - pad)
- cell_w = min(col.width + 2 * pad, img_w - cell_x)
- cell_h = min(row.height + 2 * pad, img_h - cell_y)
- is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False
-
- if disp_w <= 0 or disp_h <= 0:
- return {
- 'cell_id': f"R{row_idx:02d}_C{col_idx}",
- 'row_index': row_idx,
- 'col_index': col_idx,
- 'col_type': col.type,
- 'text': '',
- 'confidence': 0.0,
- 'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
- 'bbox_pct': {
- 'x': round(col.x / img_w * 100, 2),
- 'y': round(row.y / img_h * 100, 2),
- 'w': round(col.width / img_w * 100, 2),
- 'h': round(row.height / img_h * 100, 2),
- },
- 'ocr_engine': 'word_lookup',
- }
-
- # --- PRIMARY: Word-lookup from full-page Tesseract ---
- words = preassigned_words if preassigned_words is not None else []
- used_engine = 'word_lookup'
-
- # Filter low-confidence words (OCR noise from images/artifacts).
- # Tesseract gives low confidence to misread image edges, borders,
- # and other non-text elements.
- if words:
- words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
-
- if words:
- # Use row height as Y-tolerance so all words within a single row
- # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
- # across two lines due to slight vertical offset).
- y_tol = max(15, row.height)
- text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
- avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
- else:
- text = ''
- avg_conf = 0.0
-
- # --- FALLBACK: Cell-OCR for empty cells ---
- # Full-page Tesseract can miss small or isolated words (e.g. "Ei").
- # Re-run OCR on the cell crop to catch what word-lookup missed.
- # To avoid wasting time on truly empty cells, check pixel density first:
- # only run Tesseract if the cell crop contains enough dark pixels to
- # plausibly contain text.
- _run_fallback = False
- if not text.strip() and cell_w > 0 and cell_h > 0:
- if ocr_img is not None:
- crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
- if crop.size > 0:
- # Threshold: pixels darker than 180 (on 0-255 grayscale).
- # Use 0.5% to catch even small text like "Ei" (2 chars)
- # in an otherwise empty cell.
- dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
- _run_fallback = dark_ratio > 0.005
- if _run_fallback:
- # For narrow columns, upscale the crop before OCR
- if is_narrow and ocr_img is not None:
- _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
- _upscaled = _ensure_minimum_crop_size(_crop_slice)
- if _upscaled is not _crop_slice:
- # Build a temporary full-size image with the upscaled crop
- # placed at origin so ocr_region can crop it cleanly.
- _up_h, _up_w = _upscaled.shape[:2]
- _tmp_region = PageRegion(
- type=col.type, x=0, y=0, width=_up_w, height=_up_h,
- )
- _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
- cell_lang = lang_map.get(col.type, lang)
- fallback_words = ocr_region(_upscaled, _tmp_region,
- lang=cell_lang, psm=_cell_psm)
- # Remap word positions back to original image coordinates
- _sx = cell_w / max(_up_w, 1)
- _sy = cell_h / max(_up_h, 1)
- for _fw in (fallback_words or []):
- _fw['left'] = int(_fw['left'] * _sx) + cell_x
- _fw['top'] = int(_fw['top'] * _sy) + cell_y
- _fw['width'] = int(_fw['width'] * _sx)
- _fw['height'] = int(_fw['height'] * _sy)
- else:
- # No upscaling needed, use adaptive PSM
- cell_region = PageRegion(
- type=col.type, x=cell_x, y=cell_y,
- width=cell_w, height=cell_h,
- )
- _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
- cell_lang = lang_map.get(col.type, lang)
- fallback_words = ocr_region(ocr_img, cell_region,
- lang=cell_lang, psm=_cell_psm)
- else:
- cell_region = PageRegion(
- type=col.type,
- x=cell_x, y=cell_y,
- width=cell_w, height=cell_h,
- )
- if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
- fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
- elif engine_name == "lighton" and img_bgr is not None:
- fallback_words = ocr_region_lighton(img_bgr, cell_region)
- elif use_rapid and img_bgr is not None:
- fallback_words = ocr_region_rapid(img_bgr, cell_region)
- else:
- _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
- cell_lang = lang_map.get(col.type, lang)
- fallback_words = ocr_region(ocr_img, cell_region,
- lang=cell_lang, psm=_cell_psm)
-
- if fallback_words:
- # Apply same confidence filter to fallback words
- fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
- if fallback_words:
- fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
- fb_y_tol = max(10, int(fb_avg_h * 0.5))
- fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
- if fb_text.strip():
- text = fb_text
- avg_conf = round(
- sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
- )
- used_engine = 'cell_ocr_fallback'
-
- # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
- if not text.strip() and _run_fallback and not use_rapid:
- _fb_region = PageRegion(
- type=col.type, x=cell_x, y=cell_y,
- width=cell_w, height=cell_h,
- )
- cell_lang = lang_map.get(col.type, lang)
- psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
- if psm7_words:
- psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
- if psm7_words:
- p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
- if p7_text.strip():
- text = p7_text
- avg_conf = round(
- sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
- )
- used_engine = 'cell_ocr_psm7'
-
- # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
- # If a narrow cell is still empty, OCR the entire row strip with
- # RapidOCR (which handles small text better) and assign words by
- # X-position overlap with this column.
- if not text.strip() and is_narrow and img_bgr is not None:
- row_region = PageRegion(
- type='_row_strip', x=0, y=row.y,
- width=img_w, height=row.height,
- )
- strip_words = ocr_region_rapid(img_bgr, row_region)
- if strip_words:
- # Filter to words overlapping this column's X-range
- col_left = col.x
- col_right = col.x + col.width
- col_words = []
- for sw in strip_words:
- sw_left = sw.get('left', 0)
- sw_right = sw_left + sw.get('width', 0)
- overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
- if overlap > sw.get('width', 1) * 0.3:
- col_words.append(sw)
- if col_words:
- col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
- if col_words:
- rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
- if rs_text.strip():
- text = rs_text
- avg_conf = round(
- sum(w['conf'] for w in col_words) / len(col_words), 1
- )
- used_engine = 'row_strip_rapid'
-
- # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
- if text.strip():
- text = _clean_cell_text(text)
- if not text:
- avg_conf = 0.0
-
- return {
- 'cell_id': f"R{row_idx:02d}_C{col_idx}",
- 'row_index': row_idx,
- 'col_index': col_idx,
- 'col_type': col.type,
- 'text': text,
- 'confidence': avg_conf,
- 'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
- 'bbox_pct': {
- 'x': round(disp_x / img_w * 100, 2),
- 'y': round(disp_y / img_h * 100, 2),
- 'w': round(disp_w / img_w * 100, 2),
- 'h': round(disp_h / img_h * 100, 2),
- },
- 'ocr_engine': used_engine,
- }
-
-
-def _is_artifact_row(row: RowGeometry) -> bool:
- """Return True if this row contains only scan artifacts, not real text.
-
- Artifact rows (scanner shadows, noise) typically produce only single-character
- detections. A real content row always has at least one token with 2+ characters.
- """
- if row.word_count == 0:
- return True
- texts = [w.get('text', '').strip() for w in row.words]
- return all(len(t) <= 1 for t in texts)
-
-
-def _heal_row_gaps(
- rows: List[RowGeometry],
- top_bound: int,
- bottom_bound: int,
-) -> None:
- """Expand row y/height to fill vertical gaps caused by removed adjacent rows.
-
- After filtering out empty or artifact rows, remaining content rows may have
- gaps between them where the removed rows used to be. This function mutates
- each row to extend upward/downward to the midpoint of such gaps so that
- OCR crops cover the full available content area.
-
- The first row always extends to top_bound; the last row to bottom_bound.
- """
- if not rows:
- return
- rows.sort(key=lambda r: r.y)
- n = len(rows)
- orig = [(r.y, r.y + r.height) for r in rows] # snapshot before mutation
-
- for i, row in enumerate(rows):
- # New top: midpoint between previous row's bottom and this row's top
- if i == 0:
- new_top = top_bound
- else:
- prev_bot = orig[i - 1][1]
- my_top = orig[i][0]
- gap = my_top - prev_bot
- new_top = prev_bot + gap // 2 if gap > 1 else my_top
-
- # New bottom: midpoint between this row's bottom and next row's top
- if i == n - 1:
- new_bottom = bottom_bound
- else:
- my_bot = orig[i][1]
- next_top = orig[i + 1][0]
- gap = next_top - my_bot
- new_bottom = my_bot + gap // 2 if gap > 1 else my_bot
-
- row.y = new_top
- row.height = max(5, new_bottom - new_top)
-
- logger.debug(
- f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
- f"(bounds: top={top_bound}, bottom={bottom_bound})"
- )
-
-
-def build_cell_grid(
- ocr_img: np.ndarray,
- column_regions: List[PageRegion],
- row_geometries: List[RowGeometry],
- img_w: int,
- img_h: int,
- lang: str = "eng+deu",
- ocr_engine: str = "auto",
- img_bgr: Optional[np.ndarray] = None,
-) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
- """Generic Cell-Grid: Columns × Rows → cells with OCR text.
-
- This is the layout-agnostic foundation. Every column (except column_ignore)
- is intersected with every content row to produce numbered cells.
-
- Args:
- ocr_img: Binarized full-page image (for Tesseract).
- column_regions: Classified columns from Step 3 (PageRegion list).
- row_geometries: Rows from Step 4 (RowGeometry list).
- img_w: Image width in pixels.
- img_h: Image height in pixels.
- lang: Default Tesseract language.
- ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed', 'trocr-handwritten', or 'lighton'.
- img_bgr: BGR color image (required for RapidOCR / TrOCR / LightOnOCR).
-
- Returns:
- (cells, columns_meta) where cells is a list of cell dicts and
- columns_meta describes the columns used.
- """
- # Resolve engine choice
- use_rapid = False
- if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
- engine_name = ocr_engine
- elif ocr_engine == "auto":
- use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
- engine_name = "rapid" if use_rapid else "tesseract"
- elif ocr_engine == "rapid":
- if not RAPIDOCR_AVAILABLE:
- logger.warning("RapidOCR requested but not available, falling back to Tesseract")
- else:
- use_rapid = True
- engine_name = "rapid" if use_rapid else "tesseract"
- else:
- engine_name = "tesseract"
-
- logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")
-
- # Filter to content rows only (skip header/footer)
- content_rows = [r for r in row_geometries if r.row_type == 'content']
- if not content_rows:
- logger.warning("build_cell_grid: no content rows found")
- return [], []
-
- # Filter phantom rows: rows with no Tesseract words assigned are
- # inter-line whitespace gaps that would produce garbage OCR.
- before = len(content_rows)
- content_rows = [r for r in content_rows if r.word_count > 0]
- skipped = before - len(content_rows)
- if skipped > 0:
- logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
- if not content_rows:
- logger.warning("build_cell_grid: no content rows with words found")
- return [], []
-
- # Use columns only — skip ignore, header, footer, page_ref
- _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
- relevant_cols = [c for c in column_regions if c.type not in _skip_types]
- if not relevant_cols:
- logger.warning("build_cell_grid: no usable columns found")
- return [], []
-
- # Filter artifact rows: rows whose detected words are all single characters
- # are caused by scanner shadows or noise, not real text.
- before_art = len(content_rows)
- content_rows = [r for r in content_rows if not _is_artifact_row(r)]
- artifact_skipped = before_art - len(content_rows)
- if artifact_skipped > 0:
- logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
- if not content_rows:
- logger.warning("build_cell_grid: no content rows after artifact filtering")
- return [], []
-
- # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
- # to fill the space so OCR crops are not artificially narrow.
- _heal_row_gaps(
- content_rows,
- top_bound=min(c.y for c in relevant_cols),
- bottom_bound=max(c.y + c.height for c in relevant_cols),
- )
-
- # Sort columns left-to-right
- relevant_cols.sort(key=lambda c: c.x)
-
- # Build columns_meta
- columns_meta = [
- {
- 'index': col_idx,
- 'type': col.type,
- 'x': col.x,
- 'width': col.width,
- }
- for col_idx, col in enumerate(relevant_cols)
- ]
-
- # Choose OCR language per column type (Tesseract only)
- lang_map = {
- 'column_en': 'eng',
- 'column_de': 'deu',
- 'column_example': 'eng+deu',
- }
-
- cells: List[Dict[str, Any]] = []
-
- for row_idx, row in enumerate(content_rows):
- # Pre-assign each word to exactly one column (nearest center)
- col_words = _assign_row_words_to_columns(row, relevant_cols)
- for col_idx, col in enumerate(relevant_cols):
- cell = _ocr_single_cell(
- row_idx, col_idx, row, col,
- ocr_img, img_bgr, img_w, img_h,
- use_rapid, engine_name, lang, lang_map,
- preassigned_words=col_words[col_idx],
- )
- cells.append(cell)
-
- # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
- # Collect cells that are still empty but have visible pixels.
- # Instead of calling Tesseract once per cell (expensive), crop an entire
- # column strip and run OCR once, then assign words to cells by Y position.
- empty_by_col: Dict[int, List[int]] = {} # col_idx → [cell list indices]
- for ci, cell in enumerate(cells):
- if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
- bpx = cell['bbox_px']
- x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
- if w > 0 and h > 0 and ocr_img is not None:
- crop = ocr_img[y:y + h, x:x + w]
- if crop.size > 0:
- dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
- if dark_ratio > 0.005:
- empty_by_col.setdefault(cell['col_index'], []).append(ci)
-
- for col_idx, cell_indices in empty_by_col.items():
- if len(cell_indices) < 3:
- continue # Not worth batching for < 3 cells
-
- # Find the column strip bounding box (union of all empty cell bboxes)
- min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
- max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
- col_x = cells[cell_indices[0]]['bbox_px']['x']
- col_w = cells[cell_indices[0]]['bbox_px']['w']
-
- strip_region = PageRegion(
- type=relevant_cols[col_idx].type,
- x=col_x, y=min_y,
- width=col_w, height=max_y_h - min_y,
- )
- strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)
-
- if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
- strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
- elif engine_name == "lighton" and img_bgr is not None:
- strip_words = ocr_region_lighton(img_bgr, strip_region)
- elif use_rapid and img_bgr is not None:
- strip_words = ocr_region_rapid(img_bgr, strip_region)
- else:
- strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)
-
- if not strip_words:
- continue
-
- strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
- if not strip_words:
- continue
-
- # Assign words to cells by Y overlap
- for ci in cell_indices:
- cell_y = cells[ci]['bbox_px']['y']
- cell_h = cells[ci]['bbox_px']['h']
- cell_mid_y = cell_y + cell_h / 2
-
- matched_words = [
- w for w in strip_words
- if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
- ]
- if matched_words:
- matched_words.sort(key=lambda w: w['left'])
- batch_text = ' '.join(w['text'] for w in matched_words)
- batch_text = _clean_cell_text(batch_text)
- if batch_text.strip():
- cells[ci]['text'] = batch_text
- cells[ci]['confidence'] = round(
- sum(w['conf'] for w in matched_words) / len(matched_words), 1
- )
- cells[ci]['ocr_engine'] = 'batch_column_ocr'
-
- batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
- if batch_filled > 0:
- logger.info(
- f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
- f"empty cells in column {col_idx}"
- )
-
- # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
- # that had stray Tesseract artifacts giving word_count > 0).
- rows_with_text: set = set()
- for cell in cells:
- if cell['text'].strip():
- rows_with_text.add(cell['row_index'])
- before_filter = len(cells)
- cells = [c for c in cells if c['row_index'] in rows_with_text]
- empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
- if empty_rows_removed > 0:
- logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")
-
- logger.info(f"build_cell_grid: {len(cells)} cells from "
- f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
- f"engine={engine_name}")
-
- return cells, columns_meta
-
-
-def build_cell_grid_streaming(
- ocr_img: np.ndarray,
- column_regions: List[PageRegion],
- row_geometries: List[RowGeometry],
- img_w: int,
- img_h: int,
- lang: str = "eng+deu",
- ocr_engine: str = "auto",
- img_bgr: Optional[np.ndarray] = None,
-) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
- """Like build_cell_grid(), but yields each cell as it is OCR'd.
-
- Yields:
- (cell_dict, columns_meta, total_cells) for each cell.
- """
- # Resolve engine choice (same as build_cell_grid)
- use_rapid = False
- if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
- engine_name = ocr_engine
- elif ocr_engine == "auto":
- use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
- engine_name = "rapid" if use_rapid else "tesseract"
- elif ocr_engine == "rapid":
- if not RAPIDOCR_AVAILABLE:
- logger.warning("RapidOCR requested but not available, falling back to Tesseract")
- else:
- use_rapid = True
- engine_name = "rapid" if use_rapid else "tesseract"
- else:
- engine_name = "tesseract"
-
- content_rows = [r for r in row_geometries if r.row_type == 'content']
- if not content_rows:
- return
-
- # Filter phantom rows: rows with no Tesseract words assigned are
- # inter-line whitespace gaps that would produce garbage OCR.
- before = len(content_rows)
- content_rows = [r for r in content_rows if r.word_count > 0]
- skipped = before - len(content_rows)
- if skipped > 0:
- logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
- if not content_rows:
- return
-
- _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
- relevant_cols = [c for c in column_regions if c.type not in _skip_types]
- if not relevant_cols:
- return
-
- # Filter artifact rows + heal gaps (same logic as build_cell_grid)
- before_art = len(content_rows)
- content_rows = [r for r in content_rows if not _is_artifact_row(r)]
- artifact_skipped = before_art - len(content_rows)
- if artifact_skipped > 0:
- logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
- if not content_rows:
- return
- _heal_row_gaps(
- content_rows,
- top_bound=min(c.y for c in relevant_cols),
- bottom_bound=max(c.y + c.height for c in relevant_cols),
- )
-
- relevant_cols.sort(key=lambda c: c.x)
-
- columns_meta = [
- {
- 'index': col_idx,
- 'type': col.type,
- 'x': col.x,
- 'width': col.width,
- }
- for col_idx, col in enumerate(relevant_cols)
- ]
-
- lang_map = {
- 'column_en': 'eng',
- 'column_de': 'deu',
- 'column_example': 'eng+deu',
- }
-
- total_cells = len(content_rows) * len(relevant_cols)
-
- for row_idx, row in enumerate(content_rows):
- # Pre-assign each word to exactly one column (nearest center)
- col_words = _assign_row_words_to_columns(row, relevant_cols)
- for col_idx, col in enumerate(relevant_cols):
- cell = _ocr_single_cell(
- row_idx, col_idx, row, col,
- ocr_img, img_bgr, img_w, img_h,
- use_rapid, engine_name, lang, lang_map,
- preassigned_words=col_words[col_idx],
- )
- yield cell, columns_meta, total_cells
-
-
-def _cells_to_vocab_entries(
- cells: List[Dict[str, Any]],
- columns_meta: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
- """Map generic cells to vocab entries with english/german/example fields.
-
- Groups cells by row_index, maps col_type → field name, and produces
- one entry per row (only rows with at least one non-empty field).
- """
- # Determine image dimensions from first cell (for row-level bbox)
- col_type_to_field = {
- 'column_en': 'english',
- 'column_de': 'german',
- 'column_example': 'example',
- 'page_ref': 'source_page',
- 'column_marker': 'marker',
- 'column_text': 'text', # generic single-column (box sub-sessions)
- }
- bbox_key_map = {
- 'column_en': 'bbox_en',
- 'column_de': 'bbox_de',
- 'column_example': 'bbox_ex',
- 'page_ref': 'bbox_ref',
- 'column_marker': 'bbox_marker',
- 'column_text': 'bbox_text',
- }
-
- # Group cells by row_index
- rows: Dict[int, List[Dict]] = {}
- for cell in cells:
- ri = cell['row_index']
- rows.setdefault(ri, []).append(cell)
-
- entries: List[Dict[str, Any]] = []
- for row_idx in sorted(rows.keys()):
- row_cells = rows[row_idx]
- entry: Dict[str, Any] = {
- 'row_index': row_idx,
- 'english': '',
- 'german': '',
- 'example': '',
- 'text': '', # generic single-column (box sub-sessions)
- 'source_page': '',
- 'marker': '',
- 'confidence': 0.0,
- 'bbox': None,
- 'bbox_en': None,
- 'bbox_de': None,
- 'bbox_ex': None,
- 'bbox_ref': None,
- 'bbox_marker': None,
- 'bbox_text': None,
- 'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '',
- }
-
- confidences = []
- for cell in row_cells:
- col_type = cell['col_type']
- field = col_type_to_field.get(col_type)
- if field:
- entry[field] = cell['text']
- bbox_field = bbox_key_map.get(col_type)
- if bbox_field:
- entry[bbox_field] = cell['bbox_pct']
- if cell['confidence'] > 0:
- confidences.append(cell['confidence'])
-
- # Compute row-level bbox as union of all cell bboxes
- all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')]
- if all_bboxes:
- min_x = min(b['x'] for b in all_bboxes)
- min_y = min(b['y'] for b in all_bboxes)
- max_x2 = max(b['x'] + b['w'] for b in all_bboxes)
- max_y2 = max(b['y'] + b['h'] for b in all_bboxes)
- entry['bbox'] = {
- 'x': round(min_x, 2),
- 'y': round(min_y, 2),
- 'w': round(max_x2 - min_x, 2),
- 'h': round(max_y2 - min_y, 2),
- }
-
- entry['confidence'] = round(
- sum(confidences) / len(confidences), 1
- ) if confidences else 0.0
-
- # Only include if at least one mapped field has text
- has_content = any(
- entry.get(f)
- for f in col_type_to_field.values()
- )
- if has_content:
- entries.append(entry)
-
- return entries
-
-
-# Regex: line starts with phonetic bracket content only (no real word before it)
-_PHONETIC_ONLY_RE = re.compile(
- r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
+# --- v2 build (current default) ---
+from cv_cell_grid_build import ( # noqa: F401
+ _NARROW_COL_THRESHOLD_PCT,
+ _ocr_cell_crop,
+ build_cell_grid_v2,
)
+# --- Legacy build (DEPRECATED) ---
+from cv_cell_grid_legacy import ( # noqa: F401
+ _ocr_single_cell,
+ build_cell_grid,
+)
-def _is_phonetic_only_text(text: str) -> bool:
- """Check if text consists only of phonetic transcription.
+# --- Streaming variants ---
+from cv_cell_grid_streaming import ( # noqa: F401
+ build_cell_grid_streaming,
+ build_cell_grid_v2_streaming,
+)
- Phonetic-only patterns:
- ['mani serva] → True
- [dɑːns] → True
- ["a:mand] → True
- almond ['a:mand] → False (has real word before bracket)
- Mandel → False
- """
- t = text.strip()
- if not t:
- return False
- # Must contain at least one bracket
- if '[' not in t and ']' not in t:
- return False
- # Remove all bracket content and surrounding punctuation/whitespace
- without_brackets = re.sub(r"\[.*?\]", '', t)
- without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
- # If nothing meaningful remains, it's phonetic-only
- alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
- return len(alpha_remaining) < 2
-
-
-def _merge_phonetic_continuation_rows(
- entries: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
- """Merge rows that contain only phonetic transcription into previous entry.
-
- In dictionary pages, phonetic transcription sometimes wraps to the next
- row. E.g.:
- Row 28: EN="it's a money-saver" DE="es spart Kosten"
- Row 29: EN="['mani serva]" DE=""
-
- Row 29 is phonetic-only → merge into row 28's EN field.
- """
- if len(entries) < 2:
- return entries
-
- merged: List[Dict[str, Any]] = []
- for entry in entries:
- en = (entry.get('english') or '').strip()
- de = (entry.get('german') or '').strip()
- ex = (entry.get('example') or '').strip()
-
- # Check if this entry is phonetic-only (EN has only phonetics, DE empty)
- if merged and _is_phonetic_only_text(en) and not de:
- prev = merged[-1]
- prev_en = (prev.get('english') or '').strip()
- # Append phonetic to previous entry's EN
- if prev_en:
- prev['english'] = prev_en + ' ' + en
- else:
- prev['english'] = en
- # If there was an example, append to previous too
- if ex:
- prev_ex = (prev.get('example') or '').strip()
- prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
- logger.debug(
- f"Merged phonetic row {entry.get('row_index')} "
- f"into previous entry: {prev['english']!r}"
- )
- continue
-
- merged.append(entry)
-
- return merged
-
-
-def _merge_wrapped_rows(
- entries: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
- """Merge rows where the primary column (EN) is empty — cell wrap continuation.
-
- In textbook vocabulary tables, columns are often narrow, so the author
- wraps text within a cell. OCR treats each physical line as a separate row.
- The key indicator: if the EN column is empty but DE/example have text,
- this row is a continuation of the previous row's cells.
-
- Example (original textbook has ONE row):
- Row 2: EN="take part (in)" DE="teilnehmen (an), mitmachen" EX="More than 200 singers took"
- Row 3: EN="" DE="(bei)" EX="part in the concert."
- → Merged: EN="take part (in)" DE="teilnehmen (an), mitmachen (bei)" EX="More than 200 singers took part in the concert."
-
- Also handles the reverse case: DE empty but EN has text (wrap in EN column).
- """
- if len(entries) < 2:
- return entries
-
- merged: List[Dict[str, Any]] = []
- for entry in entries:
- en = (entry.get('english') or '').strip()
- de = (entry.get('german') or '').strip()
- ex = (entry.get('example') or '').strip()
-
- if not merged:
- merged.append(entry)
- continue
-
- prev = merged[-1]
- prev_en = (prev.get('english') or '').strip()
- prev_de = (prev.get('german') or '').strip()
- prev_ex = (prev.get('example') or '').strip()
-
- # Case 1: EN is empty → continuation of previous row
- # (DE or EX have text that should be appended to previous row)
- if not en and (de or ex) and prev_en:
- if de:
- if prev_de.endswith(','):
- sep = ' ' # "Wort," + " " + "Ausdruck"
- elif prev_de.endswith(('-', '(')):
- sep = '' # "teil-" + "nehmen" or "(" + "bei)"
- else:
- sep = ' '
- prev['german'] = (prev_de + sep + de).strip()
- if ex:
- sep = ' ' if prev_ex else ''
- prev['example'] = (prev_ex + sep + ex).strip()
- logger.debug(
- f"Merged wrapped row {entry.get('row_index')} into previous "
- f"(empty EN): DE={prev['german']!r}, EX={prev.get('example', '')!r}"
- )
- continue
-
- # Case 2: DE is empty, EN has text that looks like continuation
- # (starts with lowercase or is a parenthetical like "(bei)")
- if en and not de and prev_de:
- is_paren = en.startswith('(')
- first_alpha = next((c for c in en if c.isalpha()), '')
- starts_lower = first_alpha and first_alpha.islower()
-
- if (is_paren or starts_lower) and len(en.split()) < 5:
- sep = ' ' if prev_en and not prev_en.endswith((',', '-', '(')) else ''
- prev['english'] = (prev_en + sep + en).strip()
- if ex:
- sep2 = ' ' if prev_ex else ''
- prev['example'] = (prev_ex + sep2 + ex).strip()
- logger.debug(
- f"Merged wrapped row {entry.get('row_index')} into previous "
- f"(empty DE): EN={prev['english']!r}"
- )
- continue
-
- merged.append(entry)
-
- if len(merged) < len(entries):
- logger.info(
- f"_merge_wrapped_rows: merged {len(entries) - len(merged)} "
- f"continuation rows ({len(entries)} → {len(merged)})"
- )
- return merged
-
-
-def _merge_continuation_rows(
- entries: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
- """Merge multi-line vocabulary entries where text wraps to the next row.
-
- A row is a continuation of the previous entry when:
- - EN has text, but DE is empty
- - EN starts with a lowercase letter (not a new vocab entry)
- - Previous entry's EN does NOT end with a sentence terminator (.!?)
- - The continuation text has fewer than 4 words (not an example sentence)
- - The row was not already merged as phonetic
-
- Example:
- Row 5: EN="to put up" DE="aufstellen"
- Row 6: EN="with sth." DE=""
- → Merged: EN="to put up with sth." DE="aufstellen"
- """
- if len(entries) < 2:
- return entries
-
- merged: List[Dict[str, Any]] = []
- for entry in entries:
- en = (entry.get('english') or '').strip()
- de = (entry.get('german') or '').strip()
-
- if merged and en and not de:
- # Check: not phonetic (already handled)
- if _is_phonetic_only_text(en):
- merged.append(entry)
- continue
-
- # Check: starts with lowercase
- first_alpha = next((c for c in en if c.isalpha()), '')
- starts_lower = first_alpha and first_alpha.islower()
-
- # Check: fewer than 4 words (not an example sentence)
- word_count = len(en.split())
- is_short = word_count < 4
-
- # Check: previous entry doesn't end with sentence terminator
- prev = merged[-1]
- prev_en = (prev.get('english') or '').strip()
- prev_ends_sentence = prev_en and prev_en[-1] in '.!?'
-
- if starts_lower and is_short and not prev_ends_sentence:
- # Merge into previous entry
- prev['english'] = (prev_en + ' ' + en).strip()
- # Merge example if present
- ex = (entry.get('example') or '').strip()
- if ex:
- prev_ex = (prev.get('example') or '').strip()
- prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
- logger.debug(
- f"Merged continuation row {entry.get('row_index')} "
- f"into previous entry: {prev['english']!r}"
- )
- continue
-
- merged.append(entry)
-
- return merged
-
-
-def build_word_grid(
- ocr_img: np.ndarray,
- column_regions: List[PageRegion],
- row_geometries: List[RowGeometry],
- img_w: int,
- img_h: int,
- lang: str = "eng+deu",
- ocr_engine: str = "auto",
- img_bgr: Optional[np.ndarray] = None,
- pronunciation: str = "british",
-) -> List[Dict[str, Any]]:
- """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing.
-
- Wrapper around build_cell_grid() that adds vocabulary-specific logic:
- - Maps cells to english/german/example entries
- - Applies character confusion fixes, IPA lookup, comma splitting, etc.
- - Falls back to returning raw cells if no vocab columns detected.
-
- Args:
- ocr_img: Binarized full-page image (for Tesseract).
- column_regions: Classified columns from Step 3.
- row_geometries: Rows from Step 4.
- img_w, img_h: Image dimensions.
- lang: Default Tesseract language.
- ocr_engine: 'tesseract', 'rapid', or 'auto'.
- img_bgr: BGR color image (required for RapidOCR).
- pronunciation: 'british' or 'american' for IPA lookup.
-
- Returns:
- List of entry dicts with english/german/example text and bbox info (percent).
- """
- cells, columns_meta = build_cell_grid(
- ocr_img, column_regions, row_geometries, img_w, img_h,
- lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
- )
-
- if not cells:
- return []
-
- # Check if vocab layout is present
- col_types = {c['type'] for c in columns_meta}
- if not (col_types & {'column_en', 'column_de'}):
- logger.info("build_word_grid: no vocab columns — returning raw cells")
- return cells
-
- # Vocab mapping: cells → entries
- entries = _cells_to_vocab_entries(cells, columns_meta)
-
- # --- Post-processing pipeline (deterministic, no LLM) ---
- n_raw = len(entries)
-
- # 0. Merge cell-wrap continuation rows (empty primary column = text wrap)
- entries = _merge_wrapped_rows(entries)
-
- # 0a. Merge phonetic-only continuation rows into previous entry
- entries = _merge_phonetic_continuation_rows(entries)
-
- # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
- entries = _merge_continuation_rows(entries)
-
- # 1. Character confusion (| → I, 1 → I, 8 → B) is now run in
- # llm_review_entries_streaming so changes are visible to the user in Step 6.
-
- # 2. Replace OCR'd phonetics with dictionary IPA
- entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
-
- # 3. Split comma-separated word forms (break, broke, broken → 3 entries)
- entries = _split_comma_entries(entries)
-
- # 4. Attach example sentences (rows without DE → examples for preceding entry)
- entries = _attach_example_sentences(entries)
-
- engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown'
- logger.info(f"build_word_grid: {len(entries)} entries from "
- f"{n_raw} raw → {len(entries)} after post-processing "
- f"(engine={engine_name})")
-
- return entries
+# --- Row merging ---
+from cv_cell_grid_merge import ( # noqa: F401
+ _PHONETIC_ONLY_RE,
+ _is_phonetic_only_text,
+ _merge_continuation_rows,
+ _merge_phonetic_continuation_rows,
+ _merge_wrapped_rows,
+)
+# --- Vocab extraction ---
+from cv_cell_grid_vocab import ( # noqa: F401
+ _cells_to_vocab_entries,
+ build_word_grid,
+)
diff --git a/klausur-service/backend/cv_cell_grid_build.py b/klausur-service/backend/cv_cell_grid_build.py
new file mode 100644
index 0000000..9ac0ac5
--- /dev/null
+++ b/klausur-service/backend/cv_cell_grid_build.py
@@ -0,0 +1,498 @@
+"""
+Cell-grid construction v2 (hybrid: broad columns via word lookup, narrow via cell-crop).
+Extracted from cv_cell_grid.py.
+Lizenz: Apache 2.0 — DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
    RAPIDOCR_AVAILABLE,
    _assign_row_words_to_columns,
    _clean_cell_text,
    _clean_cell_text_lite,
    _words_to_reading_order_text,
    _words_to_spaced_text,
    ocr_region,
    ocr_region_lighton,
    ocr_region_rapid,
    ocr_region_trocr,
)
+from cv_cell_grid_helpers import (
+ _MIN_WORD_CONF,
+ _ensure_minimum_crop_size,
+ _heal_row_gaps,
+ _is_artifact_row,
+ _select_psm_for_column,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+ import cv2
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+
+
+# ---------------------------------------------------------------------------
+# _ocr_cell_crop — isolated cell-crop OCR for v2 hybrid mode
+# ---------------------------------------------------------------------------
+
def _ocr_cell_crop(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
) -> Dict[str, Any]:
    """OCR a single cell by cropping the exact column x row intersection.

    No padding beyond cell boundaries -> no neighbour bleeding.

    Args:
        row_idx, col_idx: Grid coordinates; used for the cell_id and logging.
        row, col: Row band and column band whose intersection forms the cell.
        ocr_img: Preprocessed grayscale page; used for the pixel-density
            emptiness check and the Tesseract branch.
        img_bgr: Original colour page required by the TrOCR / LightOn /
            RapidOCR branches; when None those branches produce no words.
        img_w, img_h: Full page dimensions in pixels (for bbox_pct).
        engine_name: One of 'tesseract', 'rapid', 'lighton',
            'trocr-printed', 'trocr-handwritten'.
        lang: Default Tesseract language string.
        lang_map: Per-column-type Tesseract language overrides.

    Returns:
        Cell dict (cell_id, row/col indices, col_type, text, confidence,
        bbox_px, bbox_pct, ocr_engine, is_bold); additionally 'word_boxes'
        in absolute image coordinates when any words survived filtering.
    """
    # Display bbox: exact column x row intersection
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # Crop boundaries: add small internal padding (3px each side) to avoid
    # clipping characters near column/row edges (e.g. parentheses, descenders).
    # Stays within image bounds but may extend slightly beyond strict cell.
    # 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
    _PAD = 3
    cx = max(0, disp_x - _PAD)
    cy = max(0, disp_y - _PAD)
    cx2 = min(img_w, disp_x + disp_w + _PAD)
    cy2 = min(img_h, disp_y + disp_h + _PAD)
    cw = cx2 - cx
    ch = cy2 - cy

    # Base cell record: early exits return it as-is; the success path at the
    # bottom copies it and fills in text/confidence/engine.
    empty_cell = {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': '',
        'confidence': 0.0,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2) if img_w else 0,
            'y': round(disp_y / img_h * 100, 2) if img_h else 0,
            'w': round(disp_w / img_w * 100, 2) if img_w else 0,
            'h': round(disp_h / img_h * 100, 2) if img_h else 0,
        },
        'ocr_engine': 'cell_crop_v2',
        'is_bold': False,
    }

    if cw <= 0 or ch <= 0:
        logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
        return empty_cell

    # --- Pixel-density check: skip truly empty cells ---
    # NOTE(review): thresholds (gray < 180, dark_ratio < 0.005) assume a
    # light-background scan — confirm for inverted/dark sources.
    if ocr_img is not None:
        crop = ocr_img[cy:cy + ch, cx:cx + cw]
        if crop.size > 0:
            dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
            if dark_ratio < 0.005:
                logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
                             row_idx, col_idx, dark_ratio, cw, ch)
                return empty_cell

    # --- Prepare crop for OCR ---
    cell_lang = lang_map.get(col.type, lang)
    psm = _select_psm_for_column(col.type, col.width, row.height)
    text = ''
    avg_conf = 0.0
    used_engine = 'cell_crop_v2'

    # --- Engine dispatch: every branch yields word dicts whose left/top are
    # remapped to absolute page coordinates before further processing. ---
    if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_trocr(img_bgr, cell_region,
                                 handwritten=(engine_name == "trocr-handwritten"))
    elif engine_name == "lighton" and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_lighton(img_bgr, cell_region)
    elif engine_name == "rapid" and img_bgr is not None:
        # Upscale small BGR crops for RapidOCR.
        bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
        if bgr_crop.size == 0:
            words = []
        else:
            crop_h, crop_w = bgr_crop.shape[:2]
            if crop_h < 80:
                # Force 3x upscale for short rows — small chars need more pixels
                scale = 3.0
                bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
                                    interpolation=cv2.INTER_CUBIC)
            else:
                bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
            up_h, up_w = bgr_up.shape[:2]
            scale_x = up_w / max(crop_w, 1)
            scale_y = up_h / max(crop_h, 1)
            was_scaled = (up_w != crop_w or up_h != crop_h)
            logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
                         row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region_rapid(bgr_up, tmp_region)
            # Remap positions back to original image coords
            if words and was_scaled:
                for w in words:
                    w['left'] = int(w['left'] / scale_x) + cx
                    w['top'] = int(w['top'] / scale_y) + cy
                    w['width'] = int(w['width'] / scale_x)
                    w['height'] = int(w['height'] / scale_y)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
    else:
        # Tesseract: upscale tiny crops for better recognition
        if ocr_img is not None:
            crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
            upscaled = _ensure_minimum_crop_size(crop_slice)
            up_h, up_w = upscaled.shape[:2]
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm)
            # Remap word positions back to original image coordinates
            if words and (up_w != cw or up_h != ch):
                sx = cw / max(up_w, 1)
                sy = ch / max(up_h, 1)
                for w in words:
                    w['left'] = int(w['left'] * sx) + cx
                    w['top'] = int(w['top'] * sy) + cy
                    w['width'] = int(w['width'] * sx)
                    w['height'] = int(w['height'] * sy)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
        else:
            words = []

    # Filter low-confidence words
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # y-tolerance at least the crop height so the whole cell reads as
        # one logical line group.
        y_tol = max(15, ch)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
                     row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
    else:
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
                     row_idx, col_idx, cw, ch, psm, engine_name)

    # --- PSM 7 fallback for still-empty Tesseract cells ---
    if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
        crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
        upscaled = _ensure_minimum_crop_size(crop_slice)
        up_h, up_w = upscaled.shape[:2]
        tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
        psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_crop_v2_psm7'
                # Remap PSM7 word positions back to original image coords
                if up_w != cw or up_h != ch:
                    sx = cw / max(up_w, 1)
                    sy = ch / max(up_h, 1)
                    for w in psm7_words:
                        w['left'] = int(w['left'] * sx) + cx
                        w['top'] = int(w['top'] * sy) + cy
                        w['width'] = int(w['width'] * sx)
                        w['height'] = int(w['height'] * sy)
                else:
                    for w in psm7_words:
                        w['left'] += cx
                        w['top'] += cy
                words = psm7_words

    # --- Noise filter ---
    if text.strip():
        pre_filter = text
        text = _clean_cell_text_lite(text)
        if not text:
            logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
                         row_idx, col_idx, pre_filter)
            avg_conf = 0.0

    result = dict(empty_cell)
    result['text'] = text
    result['confidence'] = avg_conf
    result['ocr_engine'] = used_engine

    # Store individual word bounding boxes (absolute image coordinates)
    # for pixel-accurate overlay positioning in the frontend.
    if words and text.strip():
        result['word_boxes'] = [
            {
                'text': w.get('text', ''),
                'left': w['left'],
                'top': w['top'],
                'width': w['width'],
                'height': w['height'],
                'conf': w.get('conf', 0),
            }
            for w in words
            if w.get('text', '').strip()
        ]

    return result
+
+
+# Threshold: columns narrower than this (% of image width) use single-cell
+# crop OCR instead of full-page word assignment.
+_NARROW_COL_THRESHOLD_PCT = 15.0
+
+
+# ---------------------------------------------------------------------------
+# build_cell_grid_v2 — hybrid grid builder (current default)
+# ---------------------------------------------------------------------------
+
def build_cell_grid_v2(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
    skip_heal_gaps: bool = False,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.

    Drop-in replacement for build_cell_grid() -- same signature & return type.

    Strategy:
    - Broad columns (>15% image width): Use pre-assigned full-page Tesseract
      words (from row.words). Handles IPA brackets, punctuation, sentence
      continuity correctly.
    - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
      neighbour bleeding from adjacent broad columns.

    Args:
        ocr_img: Preprocessed grayscale page image.
        column_regions: Detected column bands (non-content types filtered out).
        row_geometries: Detected row bands incl. header/footer/content types.
        img_w, img_h: Page dimensions in pixels.
        lang: Default Tesseract language string.
        ocr_engine: 'auto', 'tesseract', 'rapid', 'lighton', 'trocr-*'.
        img_bgr: Original colour page, needed by non-Tesseract engines.
        skip_heal_gaps: When True, keep rows at their exact detected
            positions instead of stretching them over removed-row gaps.

    Returns:
        (cells, columns_meta): cells sorted by (row_index, col_index) with
        all-empty rows removed; columns_meta lists index/type/x/width per
        retained column.
    """
    engine_name = "tesseract"
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
        engine_name = "rapid"

    logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")

    # Filter to content rows only
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows found")
        return [], []

    # Filter phantom rows (word_count=0) and artifact rows
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows with words found")
        return [], []

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
        return [], []

    # Filter columns
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid_v2: no usable columns found")
        return [], []

    # Heal row gaps -- use header/footer boundaries
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    # skip_heal_gaps: When True, keep cell positions at their exact row geometry
    # positions without expanding to fill gaps from removed rows.
    if not skip_heal_gaps:
        _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # --- Classify columns as broad vs narrow ---
    narrow_col_indices = set()
    for ci, col in enumerate(relevant_cols):
        col_pct = (col.width / img_w * 100) if img_w > 0 else 0
        if col_pct < _NARROW_COL_THRESHOLD_PCT:
            narrow_col_indices.add(ci)

    broad_col_count = len(relevant_cols) - len(narrow_col_indices)
    logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
                f"{len(narrow_col_indices)} narrow columns (cell-crop)")

    # --- Phase 1: Broad columns via full-page word assignment ---
    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Assign full-page words to columns for this row
        col_words = _assign_row_words_to_columns(row, relevant_cols)

        for col_idx, col in enumerate(relevant_cols):
            if col_idx not in narrow_col_indices:
                # BROAD column: use pre-assigned full-page words
                words = col_words.get(col_idx, [])
                # Filter low-confidence words
                words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

                # Single full-width column (box sub-session): preserve spacing
                is_single_full_column = (
                    len(relevant_cols) == 1
                    and img_w > 0
                    and relevant_cols[0].width / img_w > 0.9
                )

                if words:
                    y_tol = max(15, row.height)
                    if is_single_full_column:
                        text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
                        logger.info(f"R{row_idx:02d}: {len(words)} words, "
                                    f"text={text!r:.100}")
                    else:
                        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                    avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
                else:
                    text = ''
                    avg_conf = 0.0
                    if is_single_full_column:
                        logger.info(f"R{row_idx:02d}: 0 words (row has "
                                    f"{row.word_count} total, y={row.y}..{row.y+row.height})")

                # Apply noise filter -- but NOT for single-column sub-sessions
                if not is_single_full_column:
                    text = _clean_cell_text(text)

                cell = {
                    'cell_id': f"R{row_idx:02d}_C{col_idx}",
                    'row_index': row_idx,
                    'col_index': col_idx,
                    'col_type': col.type,
                    'text': text,
                    'confidence': avg_conf,
                    'bbox_px': {
                        'x': col.x, 'y': row.y,
                        'w': col.width, 'h': row.height,
                    },
                    'bbox_pct': {
                        'x': round(col.x / img_w * 100, 2) if img_w else 0,
                        'y': round(row.y / img_h * 100, 2) if img_h else 0,
                        'w': round(col.width / img_w * 100, 2) if img_w else 0,
                        'h': round(row.height / img_h * 100, 2) if img_h else 0,
                    },
                    'ocr_engine': 'word_lookup',
                    'is_bold': False,
                }
                # Store word bounding boxes for pixel-accurate overlay
                if words and text.strip():
                    cell['word_boxes'] = [
                        {
                            'text': w.get('text', ''),
                            'left': w['left'],
                            'top': w['top'],
                            'width': w['width'],
                            'height': w['height'],
                            'conf': w.get('conf', 0),
                        }
                        for w in words
                        if w.get('text', '').strip()
                    ]
                cells.append(cell)

    # --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
    narrow_tasks = []
    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            if col_idx in narrow_col_indices:
                narrow_tasks.append((row_idx, col_idx, row, col))

    if narrow_tasks:
        # Tesseract is subprocess-based, so more workers are safe; other
        # engines hold model state and get a smaller pool.
        max_workers = 4 if engine_name == "tesseract" else 2
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = {
                pool.submit(
                    _ocr_cell_crop,
                    ri, ci, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    engine_name, lang, lang_map,
                ): (ri, ci)
                for ri, ci, row, col in narrow_tasks
            }
            for future in as_completed(futures):
                try:
                    cell = future.result()
                    cells.append(cell)
                except Exception as e:
                    ri, ci = futures[future]
                    logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")

    # Sort cells by (row_index, col_index)
    cells.sort(key=lambda c: (c['row_index'], c['col_index']))

    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")

    logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
                f"{len(content_rows)} rows x {len(relevant_cols)} columns, "
                f"engine={engine_name} (hybrid)")

    return cells, columns_meta
diff --git a/klausur-service/backend/cv_cell_grid_helpers.py b/klausur-service/backend/cv_cell_grid_helpers.py
new file mode 100644
index 0000000..f5e41d3
--- /dev/null
+++ b/klausur-service/backend/cv_cell_grid_helpers.py
@@ -0,0 +1,136 @@
+"""
+Shared helpers for cell-grid construction (v2 + legacy).
+
+Extracted from cv_cell_grid.py — used by both cv_cell_grid_build and
+cv_cell_grid_legacy.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import List
+
+import numpy as np
+
+from cv_vocab_types import RowGeometry
+
+logger = logging.getLogger(__name__)
+
+try:
+ import cv2
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+
+# Minimum OCR word confidence to keep (used across multiple functions)
+_MIN_WORD_CONF = 30
+
+
+def _compute_cell_padding(col_width: int, img_w: int) -> int:
+ """Adaptive padding for OCR crops based on column width.
+
+ Narrow columns (page_ref, marker) need more surrounding context so
+ Tesseract can segment characters correctly. Wide columns keep the
+ minimal 4 px padding to avoid pulling in neighbours.
+ """
+ col_pct = col_width / img_w * 100 if img_w > 0 else 100
+ if col_pct < 5:
+ return max(20, col_width // 2)
+ if col_pct < 10:
+ return max(12, col_width // 4)
+ if col_pct < 15:
+ return 8
+ return 4
+
+
+def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
+ max_scale: int = 3) -> np.ndarray:
+ """Upscale tiny crops so Tesseract gets enough pixel data.
+
+ If either dimension is below *min_dim*, the crop is bicubic-upscaled
+ so the smallest dimension reaches *min_dim* (capped at *max_scale* x).
+ """
+ h, w = crop.shape[:2]
+ if h >= min_dim and w >= min_dim:
+ return crop
+ scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
+ if scale <= 1.0:
+ return crop
+ new_w = int(w * scale)
+ new_h = int(h * scale)
+ return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
+
+
+def _select_psm_for_column(col_type: str, col_width: int,
+ row_height: int) -> int:
+ """Choose the best Tesseract PSM for a given column geometry.
+
+ - page_ref columns are almost always single short tokens -> PSM 8
+ - Very narrow or short cells -> PSM 7 (single text line)
+ - Everything else -> PSM 6 (uniform block)
+ """
+ if col_type in ('page_ref', 'marker'):
+ return 8 # single word
+ if col_width < 100 or row_height < 30:
+ return 7 # single line
+ return 6 # uniform block
+
+
+def _is_artifact_row(row: RowGeometry) -> bool:
+ """Return True if this row contains only scan artifacts, not real text.
+
+ Artifact rows (scanner shadows, noise) typically produce only single-character
+ detections. A real content row always has at least one token with 2+ characters.
+ """
+ if row.word_count == 0:
+ return True
+ texts = [w.get('text', '').strip() for w in row.words]
+ return all(len(t) <= 1 for t in texts)
+
+
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Stretch content rows in place so they tile the vertical span.

    Filtering out empty or artifact rows leaves vertical holes between
    the survivors. Each remaining row is mutated so its top and bottom
    move to the midpoint of the gap to its neighbour; the first row snaps
    to *top_bound* and the last to *bottom_bound*, so OCR crops cover the
    full available content area.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    # Snapshot (top, bottom) spans before mutating anything, so neighbour
    # lookups are not skewed by rows already adjusted.
    spans = [(r.y, r.y + r.height) for r in rows]
    last = len(rows) - 1

    for i, row in enumerate(rows):
        top, bottom = spans[i]

        if i == 0:
            top = top_bound
        else:
            prev_bottom = spans[i - 1][1]
            gap_above = top - prev_bottom
            if gap_above > 1:
                top = prev_bottom + gap_above // 2

        if i == last:
            bottom = bottom_bound
        else:
            gap_below = spans[i + 1][0] - bottom
            if gap_below > 1:
                bottom = bottom + gap_below // 2

        row.y = top
        row.height = max(5, bottom - top)

    logger.debug(
        f"_heal_row_gaps: {len(rows)} rows -> y range "
        f"[{rows[0].y}..{rows[-1].y + rows[-1].height}] "
        f"(bounds: top={top_bound}, bottom={bottom_bound})"
    )
diff --git a/klausur-service/backend/cv_cell_grid_legacy.py b/klausur-service/backend/cv_cell_grid_legacy.py
new file mode 100644
index 0000000..e00df7c
--- /dev/null
+++ b/klausur-service/backend/cv_cell_grid_legacy.py
@@ -0,0 +1,436 @@
+"""
+Legacy cell-grid construction (v1) -- DEPRECATED, kept for backward compat.
+
+Extracted from cv_cell_grid.py. Prefer build_cell_grid_v2 for new code.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
    RAPIDOCR_AVAILABLE,
    _assign_row_words_to_columns,
    _clean_cell_text,
    _words_to_reading_order_text,
    ocr_region,
    ocr_region_lighton,
    ocr_region_rapid,
    ocr_region_trocr,
)
+from cv_cell_grid_helpers import (
+ _MIN_WORD_CONF,
+ _compute_cell_padding,
+ _ensure_minimum_crop_size,
+ _heal_row_gaps,
+ _is_artifact_row,
+ _select_psm_for_column,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# _ocr_single_cell — legacy per-cell OCR with multi-level fallback
+# ---------------------------------------------------------------------------
+
def _ocr_single_cell(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    use_rapid: bool,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
    preassigned_words: Optional[List[Dict]] = None,
) -> Dict[str, Any]:
    """Populate a single cell (column x row intersection) via word lookup.

    Resolution order -- first strategy yielding non-empty text wins:
      1. PRIMARY:   *preassigned_words* from the full-page Tesseract pass.
      2. FALLBACK:  per-cell OCR on a padded crop (only attempted when the
                    crop has enough dark pixels to plausibly contain ink).
      3. SECONDARY: Tesseract PSM 7 (single line) retry, non-Rapid only.
      4. TERTIARY:  RapidOCR over the whole row strip, filtered to words
                    overlapping this column (narrow columns only).
    Finally _clean_cell_text() may clear results that are pure OCR noise.

    Returns:
        Cell dict with text, confidence, bbox_px/bbox_pct and the
        'ocr_engine' label of the strategy that produced the text.
    """
    # Display bbox: exact column x row intersection (no padding)
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # OCR crop: adaptive padding -- narrow columns get more context
    pad = _compute_cell_padding(col.width, img_w)
    cell_x = max(0, col.x - pad)
    cell_y = max(0, row.y - pad)
    cell_w = min(col.width + 2 * pad, img_w - cell_x)
    cell_h = min(row.height + 2 * pad, img_h - cell_y)
    is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False

    if disp_w <= 0 or disp_h <= 0:
        # Degenerate geometry: emit an empty cell so the grid stays rectangular.
        return {
            'cell_id': f"R{row_idx:02d}_C{col_idx}",
            'row_index': row_idx,
            'col_index': col_idx,
            'col_type': col.type,
            'text': '',
            'confidence': 0.0,
            'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
            'bbox_pct': {
                'x': round(col.x / img_w * 100, 2),
                'y': round(row.y / img_h * 100, 2),
                'w': round(col.width / img_w * 100, 2),
                'h': round(row.height / img_h * 100, 2),
            },
            'ocr_engine': 'word_lookup',
        }

    # --- PRIMARY: Word-lookup from full-page Tesseract ---
    words = preassigned_words if preassigned_words is not None else []
    used_engine = 'word_lookup'

    # Filter low-confidence words
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        y_tol = max(15, row.height)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
    else:
        text = ''
        avg_conf = 0.0

    # --- FALLBACK: Cell-OCR for empty cells ---
    # Only worth running when the crop actually contains dark pixels.
    # NOTE(review): thresholds (gray < 180, ratio > 0.005) assume a
    # light-background scan.
    _run_fallback = False
    if not text.strip() and cell_w > 0 and cell_h > 0:
        if ocr_img is not None:
            crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            if crop.size > 0:
                dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                _run_fallback = dark_ratio > 0.005
    if _run_fallback:
        # For narrow columns, upscale the crop before OCR
        if is_narrow and ocr_img is not None:
            _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            _upscaled = _ensure_minimum_crop_size(_crop_slice)
            if _upscaled is not _crop_slice:
                _up_h, _up_w = _upscaled.shape[:2]
                _tmp_region = PageRegion(
                    type=col.type, x=0, y=0, width=_up_w, height=_up_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(_upscaled, _tmp_region,
                                            lang=cell_lang, psm=_cell_psm)
                # Remap word positions back to original image coordinates
                _sx = cell_w / max(_up_w, 1)
                _sy = cell_h / max(_up_h, 1)
                for _fw in (fallback_words or []):
                    _fw['left'] = int(_fw['left'] * _sx) + cell_x
                    _fw['top'] = int(_fw['top'] * _sy) + cell_y
                    _fw['width'] = int(_fw['width'] * _sx)
                    _fw['height'] = int(_fw['height'] * _sy)
            else:
                # Crop was already large enough -- OCR it in place.
                cell_region = PageRegion(
                    type=col.type, x=cell_x, y=cell_y,
                    width=cell_w, height=cell_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)
        else:
            # Broad column: dispatch to whichever engine is configured.
            cell_region = PageRegion(
                type=col.type,
                x=cell_x, y=cell_y,
                width=cell_w, height=cell_h,
            )
            if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
                fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
            elif engine_name == "lighton" and img_bgr is not None:
                fallback_words = ocr_region_lighton(img_bgr, cell_region)
            elif use_rapid and img_bgr is not None:
                fallback_words = ocr_region_rapid(img_bgr, cell_region)
            else:
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)

        if fallback_words:
            fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if fallback_words:
            fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
            fb_y_tol = max(10, int(fb_avg_h * 0.5))
            fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
            if fb_text.strip():
                text = fb_text
                avg_conf = round(
                    sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
                )
                used_engine = 'cell_ocr_fallback'

    # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
    if not text.strip() and _run_fallback and not use_rapid:
        _fb_region = PageRegion(
            type=col.type, x=cell_x, y=cell_y,
            width=cell_w, height=cell_h,
        )
        cell_lang = lang_map.get(col.type, lang)
        psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_ocr_psm7'

    # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
    if not text.strip() and is_narrow and img_bgr is not None:
        row_region = PageRegion(
            type='_row_strip', x=0, y=row.y,
            width=img_w, height=row.height,
        )
        strip_words = ocr_region_rapid(img_bgr, row_region)
        if strip_words:
            col_left = col.x
            col_right = col.x + col.width
            col_words = []
            for sw in strip_words:
                sw_left = sw.get('left', 0)
                sw_right = sw_left + sw.get('width', 0)
                # Keep words whose horizontal overlap with this column
                # exceeds 30% of the word's own width.
                overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
                if overlap > sw.get('width', 1) * 0.3:
                    col_words.append(sw)
            if col_words:
                col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
            if col_words:
                rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
                if rs_text.strip():
                    text = rs_text
                    avg_conf = round(
                        sum(w['conf'] for w in col_words) / len(col_words), 1
                    )
                    used_engine = 'row_strip_rapid'

    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
    if text.strip():
        text = _clean_cell_text(text)
        if not text:
            avg_conf = 0.0

    return {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': text,
        'confidence': avg_conf,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2),
            'y': round(disp_y / img_h * 100, 2),
            'w': round(disp_w / img_w * 100, 2),
            'h': round(disp_h / img_h * 100, 2),
        },
        'ocr_engine': used_engine,
    }
+
+
+# ---------------------------------------------------------------------------
+# build_cell_grid — legacy grid builder (DEPRECATED)
+# ---------------------------------------------------------------------------
+
def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Generic Cell-Grid: Columns x Rows -> cells with OCR text.

    DEPRECATED: Use build_cell_grid_v2 instead.

    Pipeline: filter content rows (phantom/artifact removal) and usable
    columns, heal vertical row gaps, OCR each cell via _ocr_single_cell(),
    then batch-re-OCR whole column strips for columns with 3+ still-empty
    cells, and finally drop rows where every cell stayed empty.

    Returns:
        (cells, columns_meta) -- same shape as build_cell_grid_v2.
    """
    # Resolve engine choice
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")

    # Filter to content rows only (skip header/footer)
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid: no content rows found")
        return [], []

    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows with words found")
        return [], []

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid: no usable columns found")
        return [], []

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows after artifact filtering")
        return [], []

    # NOTE(review): bounds are derived from column-region extents here,
    # whereas build_cell_grid_v2 uses header/footer row boundaries --
    # presumably the pre-refactor behaviour; confirm intentional.
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            cells.append(cell)

    # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
    # Collect cells that stayed empty despite visible ink (dark pixels),
    # grouped by column index.
    empty_by_col: Dict[int, List[int]] = {}
    for ci, cell in enumerate(cells):
        if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
            bpx = cell['bbox_px']
            x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
            if w > 0 and h > 0 and ocr_img is not None:
                crop = ocr_img[y:y + h, x:x + w]
                if crop.size > 0:
                    dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                    if dark_ratio > 0.005:
                        empty_by_col.setdefault(cell['col_index'], []).append(ci)

    for col_idx, cell_indices in empty_by_col.items():
        # Strip OCR only pays off for 3+ empty cells in the same column.
        if len(cell_indices) < 3:
            continue

        min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
        max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
        col_x = cells[cell_indices[0]]['bbox_px']['x']
        col_w = cells[cell_indices[0]]['bbox_px']['w']

        strip_region = PageRegion(
            type=relevant_cols[col_idx].type,
            x=col_x, y=min_y,
            width=col_w, height=max_y_h - min_y,
        )
        strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)

        if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
            strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
        elif engine_name == "lighton" and img_bgr is not None:
            strip_words = ocr_region_lighton(img_bgr, strip_region)
        elif use_rapid and img_bgr is not None:
            strip_words = ocr_region_rapid(img_bgr, strip_region)
        else:
            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)

        if not strip_words:
            continue

        # Hard-coded 30 mirrors _MIN_WORD_CONF.
        strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
        if not strip_words:
            continue

        for ci in cell_indices:
            cell_y = cells[ci]['bbox_px']['y']
            cell_h = cells[ci]['bbox_px']['h']
            cell_mid_y = cell_y + cell_h / 2

            # Match strip words to this cell by vertical centre distance.
            matched_words = [
                w for w in strip_words
                if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
            ]
            if matched_words:
                matched_words.sort(key=lambda w: w['left'])
                batch_text = ' '.join(w['text'] for w in matched_words)
                batch_text = _clean_cell_text(batch_text)
                if batch_text.strip():
                    cells[ci]['text'] = batch_text
                    cells[ci]['confidence'] = round(
                        sum(w['conf'] for w in matched_words) / len(matched_words), 1
                    )
                    cells[ci]['ocr_engine'] = 'batch_column_ocr'

    batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
    if batch_filled > 0:
        logger.info(
            f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
            f"empty cells in column {col_idx}"
        )

    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")

    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows x {len(relevant_cols)} columns, "
                f"engine={engine_name}")

    return cells, columns_meta
diff --git a/klausur-service/backend/cv_cell_grid_merge.py b/klausur-service/backend/cv_cell_grid_merge.py
new file mode 100644
index 0000000..a86770e
--- /dev/null
+++ b/klausur-service/backend/cv_cell_grid_merge.py
@@ -0,0 +1,235 @@
+"""
+Row-merging logic for vocabulary entries (phonetic, wrapped, continuation rows).
+
+Extracted from cv_cell_grid.py.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import re
+from typing import Any, Dict, List
+
+from cv_ocr_engines import _RE_ALPHA
+
+logger = logging.getLogger(__name__)
+
# Regex: line starts with phonetic bracket content only (no real word before it)
# NOTE(review): this pattern appears unused in this module --
# _is_phonetic_only_text builds its own inline regexes instead.
# Candidate for removal; confirm no external module imports it first.
_PHONETIC_ONLY_RE = re.compile(
    r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
)
+
+
+def _is_phonetic_only_text(text: str) -> bool:
+ """Check if text consists only of phonetic transcription.
+
+ Phonetic-only patterns:
+ ['mani serva] -> True
+ [dance] -> True
+ ["a:mand] -> True
+ almond ['a:mand] -> False (has real word before bracket)
+ Mandel -> False
+ """
+ t = text.strip()
+ if not t:
+ return False
+ # Must contain at least one bracket
+ if '[' not in t and ']' not in t:
+ return False
+ # Remove all bracket content and surrounding punctuation/whitespace
+ without_brackets = re.sub(r"\[.*?\]", '', t)
+ without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
+ # If nothing meaningful remains, it's phonetic-only
+ alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
+ return len(alpha_remaining) < 2
+
+
+def _merge_phonetic_continuation_rows(
+ entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+ """Merge rows that contain only phonetic transcription into previous entry.
+
+ In dictionary pages, phonetic transcription sometimes wraps to the next
+ row. E.g.:
+ Row 28: EN="it's a money-saver" DE="es spart Kosten"
+ Row 29: EN="['mani serva]" DE=""
+
+ Row 29 is phonetic-only -> merge into row 28's EN field.
+ """
+ if len(entries) < 2:
+ return entries
+
+ merged: List[Dict[str, Any]] = []
+ for entry in entries:
+ en = (entry.get('english') or '').strip()
+ de = (entry.get('german') or '').strip()
+ ex = (entry.get('example') or '').strip()
+
+ # Check if this entry is phonetic-only (EN has only phonetics, DE empty)
+ if merged and _is_phonetic_only_text(en) and not de:
+ prev = merged[-1]
+ prev_en = (prev.get('english') or '').strip()
+ # Append phonetic to previous entry's EN
+ if prev_en:
+ prev['english'] = prev_en + ' ' + en
+ else:
+ prev['english'] = en
+ # If there was an example, append to previous too
+ if ex:
+ prev_ex = (prev.get('example') or '').strip()
+ prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
+ logger.debug(
+ f"Merged phonetic row {entry.get('row_index')} "
+ f"into previous entry: {prev['english']!r}"
+ )
+ continue
+
+ merged.append(entry)
+
+ return merged
+
+
+def _merge_wrapped_rows(
+ entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+ """Merge rows where the primary column (EN) is empty -- cell wrap continuation.
+
+ In textbook vocabulary tables, columns are often narrow, so the author
+ wraps text within a cell. OCR treats each physical line as a separate row.
+ The key indicator: if the EN column is empty but DE/example have text,
+ this row is a continuation of the previous row's cells.
+
+ Example (original textbook has ONE row):
+ Row 2: EN="take part (in)" DE="teilnehmen (an), mitmachen" EX="More than 200 singers took"
+ Row 3: EN="" DE="(bei)" EX="part in the concert."
+ -> Merged: EN="take part (in)" DE="teilnehmen (an), mitmachen (bei)" EX="..."
+
+ Also handles the reverse case: DE empty but EN has text (wrap in EN column).
+ """
+ if len(entries) < 2:
+ return entries
+
+ merged: List[Dict[str, Any]] = []
+ for entry in entries:
+ en = (entry.get('english') or '').strip()
+ de = (entry.get('german') or '').strip()
+ ex = (entry.get('example') or '').strip()
+
+ if not merged:
+ merged.append(entry)
+ continue
+
+ prev = merged[-1]
+ prev_en = (prev.get('english') or '').strip()
+ prev_de = (prev.get('german') or '').strip()
+ prev_ex = (prev.get('example') or '').strip()
+
+ # Case 1: EN is empty -> continuation of previous row
+ if not en and (de or ex) and prev_en:
+ if de:
+ if prev_de.endswith(','):
+ sep = ' '
+ elif prev_de.endswith(('-', '(')):
+ sep = ''
+ else:
+ sep = ' '
+ prev['german'] = (prev_de + sep + de).strip()
+ if ex:
+ sep = ' ' if prev_ex else ''
+ prev['example'] = (prev_ex + sep + ex).strip()
+ logger.debug(
+ f"Merged wrapped row {entry.get('row_index')} into previous "
+ f"(empty EN): DE={prev['german']!r}, EX={prev.get('example', '')!r}"
+ )
+ continue
+
+ # Case 2: DE is empty, EN has text that looks like continuation
+ if en and not de and prev_de:
+ is_paren = en.startswith('(')
+ first_alpha = next((c for c in en if c.isalpha()), '')
+ starts_lower = first_alpha and first_alpha.islower()
+
+ if (is_paren or starts_lower) and len(en.split()) < 5:
+ sep = ' ' if prev_en and not prev_en.endswith((',', '-', '(')) else ''
+ prev['english'] = (prev_en + sep + en).strip()
+ if ex:
+ sep2 = ' ' if prev_ex else ''
+ prev['example'] = (prev_ex + sep2 + ex).strip()
+ logger.debug(
+ f"Merged wrapped row {entry.get('row_index')} into previous "
+ f"(empty DE): EN={prev['english']!r}"
+ )
+ continue
+
+ merged.append(entry)
+
+ if len(merged) < len(entries):
+ logger.info(
+ f"_merge_wrapped_rows: merged {len(entries) - len(merged)} "
+ f"continuation rows ({len(entries)} -> {len(merged)})"
+ )
+ return merged
+
+
+def _merge_continuation_rows(
+ entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+ """Merge multi-line vocabulary entries where text wraps to the next row.
+
+ A row is a continuation of the previous entry when:
+ - EN has text, but DE is empty
+ - EN starts with a lowercase letter (not a new vocab entry)
+ - Previous entry's EN does NOT end with a sentence terminator (.!?)
+ - The continuation text has fewer than 4 words (not an example sentence)
+ - The row was not already merged as phonetic
+
+ Example:
+ Row 5: EN="to put up" DE="aufstellen"
+ Row 6: EN="with sth." DE=""
+ -> Merged: EN="to put up with sth." DE="aufstellen"
+ """
+ if len(entries) < 2:
+ return entries
+
+ merged: List[Dict[str, Any]] = []
+ for entry in entries:
+ en = (entry.get('english') or '').strip()
+ de = (entry.get('german') or '').strip()
+
+ if merged and en and not de:
+ # Check: not phonetic (already handled)
+ if _is_phonetic_only_text(en):
+ merged.append(entry)
+ continue
+
+ # Check: starts with lowercase
+ first_alpha = next((c for c in en if c.isalpha()), '')
+ starts_lower = first_alpha and first_alpha.islower()
+
+ # Check: fewer than 4 words (not an example sentence)
+ word_count = len(en.split())
+ is_short = word_count < 4
+
+ # Check: previous entry doesn't end with sentence terminator
+ prev = merged[-1]
+ prev_en = (prev.get('english') or '').strip()
+ prev_ends_sentence = prev_en and prev_en[-1] in '.!?'
+
+ if starts_lower and is_short and not prev_ends_sentence:
+ # Merge into previous entry
+ prev['english'] = (prev_en + ' ' + en).strip()
+ # Merge example if present
+ ex = (entry.get('example') or '').strip()
+ if ex:
+ prev_ex = (prev.get('example') or '').strip()
+ prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
+ logger.debug(
+ f"Merged continuation row {entry.get('row_index')} "
+ f"into previous entry: {prev['english']!r}"
+ )
+ continue
+
+ merged.append(entry)
+
+ return merged
diff --git a/klausur-service/backend/cv_cell_grid_streaming.py b/klausur-service/backend/cv_cell_grid_streaming.py
new file mode 100644
index 0000000..4db3268
--- /dev/null
+++ b/klausur-service/backend/cv_cell_grid_streaming.py
@@ -0,0 +1,217 @@
+"""
+Streaming variants of cell-grid builders (v2 + legacy).
+
+Extracted from cv_cell_grid.py. These yield cells one-by-one as OCR'd,
+useful for progress reporting.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Any, Dict, Generator, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import PageRegion, RowGeometry
+from cv_ocr_engines import (
+ RAPIDOCR_AVAILABLE,
+ _assign_row_words_to_columns,
+)
+from cv_cell_grid_helpers import (
+ _heal_row_gaps,
+ _is_artifact_row,
+)
+from cv_cell_grid_build import _ocr_cell_crop
+from cv_cell_grid_legacy import _ocr_single_cell
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# build_cell_grid_v2_streaming
+# ---------------------------------------------------------------------------
+
def build_cell_grid_v2_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid_v2 -- yields each cell as OCR'd.

    Filters rows down to non-empty, non-artifact content rows, keeps only
    non-skipped columns, heals vertical gaps between the detected header and
    footer boundaries, then OCRs every (row, column) cell via _ocr_cell_crop
    and yields it immediately -- useful for progress reporting.

    Args:
        ocr_img: Full-page image handed to _ocr_cell_crop.
        column_regions: Classified column regions; skip-typed ones are ignored.
        row_geometries: Row geometries with row_type/word_count/y/height.
        img_w, img_h: Image dimensions in pixels.
        lang: Default OCR language string.
        ocr_engine: 'auto', 'tesseract', 'rapid', 'trocr-*' or 'lighton'.
        img_bgr: Optional BGR color image, forwarded to _ocr_cell_crop.

    Yields:
        (cell_dict, columns_meta, total_cells)
    """
    # Engine resolution. Unlike the legacy build_cell_grid_streaming below,
    # 'auto' here always resolves to Tesseract -- RapidOCR is only chosen
    # when explicitly requested (and available).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        engine_name = "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        # use_rapid only feeds engine_name here; it is not passed downstream.
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    # Filter 1: only 'content' rows participate in the grid.
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    # Filter 2: drop phantom rows that carry no words.
    content_rows = [r for r in content_rows if r.word_count > 0]
    if not content_rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        return

    # Filter 3: drop rows classified as scan artifacts.
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    if not content_rows:
        return

    # Use header/footer boundaries for heal_row_gaps
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        # Bottom edge of the lowest header row bounds the content area above.
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        # Top edge of the highest footer row bounds the content area below.
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    # Presumably mutates content_rows in place (return value unused) --
    # confirm against cv_cell_grid_helpers._heal_row_gaps.
    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    # Left-to-right column order determines col_index in yielded cells.
    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    # Per-column-type OCR language override, forwarded to _ocr_cell_crop.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # Total cell count, reported with every yield for progress display.
    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_cell_crop(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                engine_name, lang, lang_map,
            )
            yield cell, columns_meta, total_cells
+
+
+# ---------------------------------------------------------------------------
+# build_cell_grid_streaming — legacy streaming variant
+# ---------------------------------------------------------------------------
+
def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Like build_cell_grid(), but yields each cell as it is OCR'd.

    DEPRECATED: Use build_cell_grid_v2_streaming instead.

    Filters to non-empty, non-artifact content rows, heals row gaps using
    the column extents as vertical bounds, pre-assigns the row's words to
    columns, then OCRs each (row, column) cell via _ocr_single_cell.

    Yields:
        (cell_dict, columns_meta, total_cells) for each cell.
    """
    # Engine resolution: here 'auto' prefers RapidOCR when it is available
    # and a BGR image was supplied (the v2 variant always picks Tesseract).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    # Filter 1: only 'content' rows participate in the grid.
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    # Filter 2: drop phantom rows that carry no words (logged for diagnosis).
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        return

    # Filter 3: drop rows classified as scan artifacts (logged for diagnosis).
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        return
    # Column extents serve as healing bounds here (the v2 variant uses
    # header/footer rows instead). Presumably mutates content_rows in
    # place (return value unused) -- confirm in cv_cell_grid_helpers.
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    # Left-to-right column order determines col_index in yielded cells.
    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    # Per-column-type OCR language override, forwarded to _ocr_single_cell.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # Total cell count, reported with every yield for progress display.
    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        # Pre-assign the row's detected words to columns once per row,
        # then hand the matching subset to each cell's OCR call.
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            yield cell, columns_meta, total_cells
diff --git a/klausur-service/backend/cv_cell_grid_vocab.py b/klausur-service/backend/cv_cell_grid_vocab.py
new file mode 100644
index 0000000..d475c33
--- /dev/null
+++ b/klausur-service/backend/cv_cell_grid_vocab.py
@@ -0,0 +1,200 @@
+"""
+Vocabulary extraction: cells -> vocab entries, and build_word_grid wrapper.
+
+Extracted from cv_cell_grid.py.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import Any, Dict, List
+
+from cv_ocr_engines import (
+ _attach_example_sentences,
+ _fix_phonetic_brackets,
+ _split_comma_entries,
+)
+from cv_cell_grid_legacy import build_cell_grid
+from cv_cell_grid_merge import (
+ _merge_continuation_rows,
+ _merge_phonetic_continuation_rows,
+ _merge_wrapped_rows,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _cells_to_vocab_entries(
+ cells: List[Dict[str, Any]],
+ columns_meta: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+ """Map generic cells to vocab entries with english/german/example fields.
+
+ Groups cells by row_index, maps col_type -> field name, and produces
+ one entry per row (only rows with at least one non-empty field).
+ """
+ col_type_to_field = {
+ 'column_en': 'english',
+ 'column_de': 'german',
+ 'column_example': 'example',
+ 'page_ref': 'source_page',
+ 'column_marker': 'marker',
+ 'column_text': 'text', # generic single-column (box sub-sessions)
+ }
+ bbox_key_map = {
+ 'column_en': 'bbox_en',
+ 'column_de': 'bbox_de',
+ 'column_example': 'bbox_ex',
+ 'page_ref': 'bbox_ref',
+ 'column_marker': 'bbox_marker',
+ 'column_text': 'bbox_text',
+ }
+
+ # Group cells by row_index
+ rows: Dict[int, List[Dict]] = {}
+ for cell in cells:
+ ri = cell['row_index']
+ rows.setdefault(ri, []).append(cell)
+
+ entries: List[Dict[str, Any]] = []
+ for row_idx in sorted(rows.keys()):
+ row_cells = rows[row_idx]
+ entry: Dict[str, Any] = {
+ 'row_index': row_idx,
+ 'english': '',
+ 'german': '',
+ 'example': '',
+ 'text': '', # generic single-column (box sub-sessions)
+ 'source_page': '',
+ 'marker': '',
+ 'confidence': 0.0,
+ 'bbox': None,
+ 'bbox_en': None,
+ 'bbox_de': None,
+ 'bbox_ex': None,
+ 'bbox_ref': None,
+ 'bbox_marker': None,
+ 'bbox_text': None,
+ 'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '',
+ }
+
+ confidences = []
+ for cell in row_cells:
+ col_type = cell['col_type']
+ field = col_type_to_field.get(col_type)
+ if field:
+ entry[field] = cell['text']
+ bbox_field = bbox_key_map.get(col_type)
+ if bbox_field:
+ entry[bbox_field] = cell['bbox_pct']
+ if cell['confidence'] > 0:
+ confidences.append(cell['confidence'])
+
+ # Compute row-level bbox as union of all cell bboxes
+ all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')]
+ if all_bboxes:
+ min_x = min(b['x'] for b in all_bboxes)
+ min_y = min(b['y'] for b in all_bboxes)
+ max_x2 = max(b['x'] + b['w'] for b in all_bboxes)
+ max_y2 = max(b['y'] + b['h'] for b in all_bboxes)
+ entry['bbox'] = {
+ 'x': round(min_x, 2),
+ 'y': round(min_y, 2),
+ 'w': round(max_x2 - min_x, 2),
+ 'h': round(max_y2 - min_y, 2),
+ }
+
+ entry['confidence'] = round(
+ sum(confidences) / len(confidences), 1
+ ) if confidences else 0.0
+
+ # Only include if at least one mapped field has text
+ has_content = any(
+ entry.get(f)
+ for f in col_type_to_field.values()
+ )
+ if has_content:
+ entries.append(entry)
+
+ return entries
+
+
def build_word_grid(
    ocr_img,
    column_regions,
    row_geometries,
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr=None,
    pronunciation: str = "british",
) -> List[Dict[str, Any]]:
    """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing.

    Wrapper around build_cell_grid() that adds vocabulary-specific logic:
    - Maps cells to english/german/example entries
    - Applies character confusion fixes, IPA lookup, comma splitting, etc.
    - Falls back to returning raw cells if no vocab columns detected.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3.
        row_geometries: Rows from Step 4.
        img_w, img_h: Image dimensions.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', or 'auto'.
        img_bgr: BGR color image (required for RapidOCR).
        pronunciation: 'british' or 'american' for IPA lookup.

    Returns:
        List of entry dicts with english/german/example text and bbox info (percent).
    """
    cells, columns_meta = build_cell_grid(
        ocr_img, column_regions, row_geometries, img_w, img_h,
        lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
    )

    if not cells:
        return []

    # Check if vocab layout is present
    col_types = {c['type'] for c in columns_meta}
    if not (col_types & {'column_en', 'column_de'}):
        logger.info("build_word_grid: no vocab columns -- returning raw cells")
        return cells

    # Vocab mapping: cells -> entries
    entries = _cells_to_vocab_entries(cells, columns_meta)

    # --- Post-processing pipeline (deterministic, no LLM) ---
    n_raw = len(entries)

    # 0. Merge cell-wrap continuation rows (empty primary column = text wrap)
    entries = _merge_wrapped_rows(entries)

    # 0a. Merge phonetic-only continuation rows into previous entry
    entries = _merge_phonetic_continuation_rows(entries)

    # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
    entries = _merge_continuation_rows(entries)

    # 1. Character confusion (| -> I, 1 -> I, 8 -> B) is now run in
    #    llm_review_entries_streaming so changes are visible to the user in Step 6.

    # 2. Replace OCR'd phonetics with dictionary IPA
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)

    # 3. Split comma-separated word forms (break, broke, broken -> 3 entries)
    entries = _split_comma_entries(entries)

    # 4. Attach example sentences (rows without DE -> examples for preceding entry)
    entries = _attach_example_sentences(entries)

    engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown'
    # BUGFIX: the summary log previously interpolated len(entries) for both
    # the "raw" and "after" counts, so n_raw was computed but never reported.
    logger.info(f"build_word_grid: {n_raw} raw entries -> "
                f"{len(entries)} after post-processing "
                f"(engine={engine_name})")

    return entries
diff --git a/klausur-service/backend/cv_preprocessing.py b/klausur-service/backend/cv_preprocessing.py
index 71c4f50..0cb2841 100644
--- a/klausur-service/backend/cv_preprocessing.py
+++ b/klausur-service/backend/cv_preprocessing.py
@@ -1,14 +1,19 @@
"""
Image I/O, orientation detection, deskew, and dewarp for the CV vocabulary pipeline.
+Re-export facade -- all logic lives in the sub-modules:
+
+ cv_preprocessing_deskew Rotation correction (Hough, word-alignment, iterative, two-pass)
+ cv_preprocessing_dewarp Vertical shear detection and correction (4 methods + ensemble)
+
+This file contains the image I/O and orientation detection functions.
+
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
-import time
-from collections import defaultdict
-from typing import Any, Dict, List, Tuple
+from typing import Tuple
import numpy as np
@@ -19,7 +24,7 @@ from cv_vocab_types import (
logger = logging.getLogger(__name__)
-# Guarded imports — mirror cv_vocab_types guards
+# Guarded imports
try:
import cv2
except ImportError:
@@ -32,6 +37,33 @@ except ImportError:
pytesseract = None # type: ignore[assignment]
Image = None # type: ignore[assignment,misc]
+# Re-export all deskew functions
+from cv_preprocessing_deskew import ( # noqa: F401
+ deskew_image,
+ deskew_image_by_word_alignment,
+ deskew_image_iterative,
+ deskew_two_pass,
+ _projection_gradient_score,
+ _measure_textline_slope,
+)
+
+# Re-export all dewarp functions
+from cv_preprocessing_dewarp import ( # noqa: F401
+ _apply_shear,
+ _detect_shear_angle,
+ _detect_shear_by_hough,
+ _detect_shear_by_projection,
+ _detect_shear_by_text_lines,
+ _dewarp_quality_check,
+ _ensemble_shear,
+ dewarp_image,
+ dewarp_image_manual,
+)
+
+
+# =============================================================================
+# Image I/O
+# =============================================================================
def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray:
"""Render a PDF page to a high-resolution numpy array (BGR).
@@ -54,7 +86,6 @@ def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0
mat = fitz.Matrix(zoom, zoom)
pix = page.get_pixmap(matrix=mat)
- # Convert to numpy BGR
img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
if pix.n == 4: # RGBA
img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR)
@@ -84,23 +115,19 @@ def render_image_high_res(image_data: bytes) -> np.ndarray:
# =============================================================================
-# Stage 1b: Orientation Detection (0°/90°/180°/270°)
+# Orientation Detection (0/90/180/270)
# =============================================================================
def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
"""Detect page orientation via Tesseract OSD and rotate if needed.
- Handles upside-down scans (180°) common with book scanners where
- every other page is flipped due to the scanner hinge.
-
Returns:
- (corrected_image, rotation_degrees) — rotation is 0, 90, 180, or 270.
+ (corrected_image, rotation_degrees) -- rotation is 0, 90, 180, or 270.
"""
if pytesseract is None:
return img_bgr, 0
try:
- # Tesseract OSD needs a grayscale or RGB image
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
pil_img = Image.fromarray(gray)
@@ -108,12 +135,11 @@ def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
rotate = osd.get("rotate", 0)
confidence = osd.get("orientation_conf", 0.0)
- logger.info(f"OSD: orientation={rotate}° confidence={confidence:.1f}")
+ logger.info(f"OSD: orientation={rotate}\u00b0 confidence={confidence:.1f}")
if rotate == 0 or confidence < 1.0:
return img_bgr, 0
- # Apply rotation — OSD rotate is the clockwise correction needed
if rotate == 180:
corrected = cv2.rotate(img_bgr, cv2.ROTATE_180)
elif rotate == 90:
@@ -123,1044 +149,9 @@ def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
else:
return img_bgr, 0
- logger.info(f"OSD: rotated {rotate}° to fix orientation")
+ logger.info(f"OSD: rotated {rotate}\u00b0 to fix orientation")
return corrected, rotate
except Exception as e:
logger.warning(f"OSD orientation detection failed: {e}")
return img_bgr, 0
-
-
-# =============================================================================
-# Stage 2: Deskew (Rotation Correction)
-# =============================================================================
-
-def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
- """Correct rotation using Hough Line detection.
-
- Args:
- img: BGR image.
-
- Returns:
- Tuple of (corrected image, detected angle in degrees).
- """
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
- # Binarize for line detection
- _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
-
- # Detect lines
- lines = cv2.HoughLinesP(binary, 1, np.pi / 180, threshold=100,
- minLineLength=img.shape[1] // 4, maxLineGap=20)
-
- if lines is None or len(lines) < 3:
- return img, 0.0
-
- # Compute angles of near-horizontal lines
- angles = []
- for line in lines:
- x1, y1, x2, y2 = line[0]
- angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
- if abs(angle) < 15: # Only near-horizontal
- angles.append(angle)
-
- if not angles:
- return img, 0.0
-
- median_angle = float(np.median(angles))
-
- # Limit correction to ±5°
- if abs(median_angle) > 5.0:
- median_angle = 5.0 * np.sign(median_angle)
-
- if abs(median_angle) < 0.1:
- return img, 0.0
-
- # Rotate
- h, w = img.shape[:2]
- center = (w // 2, h // 2)
- M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
- corrected = cv2.warpAffine(img, M, (w, h),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE)
-
- logger.info(f"Deskew: corrected {median_angle:.2f}° rotation")
- return corrected, median_angle
-
-
-def deskew_image_by_word_alignment(
- image_data: bytes,
- lang: str = "eng+deu",
- downscale_factor: float = 0.5,
-) -> Tuple[bytes, float]:
- """Correct rotation by fitting a line through left-most word starts per text line.
-
- More robust than Hough-based deskew for vocabulary worksheets where text lines
- have consistent left-alignment. Runs a quick Tesseract pass on a downscaled
- copy to find word positions, computes the dominant left-edge column, fits a
- line through those points and rotates the full-resolution image.
-
- Args:
- image_data: Raw image bytes (PNG/JPEG).
- lang: Tesseract language string for the quick pass.
- downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).
-
- Returns:
- Tuple of (rotated image as PNG bytes, detected angle in degrees).
- """
- if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
- return image_data, 0.0
-
- # 1. Decode image
- img_array = np.frombuffer(image_data, dtype=np.uint8)
- img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
- if img is None:
- logger.warning("deskew_by_word_alignment: could not decode image")
- return image_data, 0.0
-
- orig_h, orig_w = img.shape[:2]
-
- # 2. Downscale for fast Tesseract pass
- small_w = int(orig_w * downscale_factor)
- small_h = int(orig_h * downscale_factor)
- small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)
-
- # 3. Quick Tesseract — word-level positions
- pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
- try:
- data = pytesseract.image_to_data(
- pil_small, lang=lang, config="--psm 6 --oem 3",
- output_type=pytesseract.Output.DICT,
- )
- except Exception as e:
- logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
- return image_data, 0.0
-
- # 4. Per text-line, find the left-most word start
- # Group by (block_num, par_num, line_num)
- line_groups: Dict[tuple, list] = defaultdict(list)
- for i in range(len(data["text"])):
- text = (data["text"][i] or "").strip()
- conf = int(data["conf"][i])
- if not text or conf < 20:
- continue
- key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
- line_groups[key].append(i)
-
- if len(line_groups) < 5:
- logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
- return image_data, 0.0
-
- # For each line, pick the word with smallest 'left' → compute (left_x, center_y)
- # Scale back to original resolution
- scale = 1.0 / downscale_factor
- points = [] # list of (x, y) in original-image coords
- for key, indices in line_groups.items():
- best_idx = min(indices, key=lambda i: data["left"][i])
- lx = data["left"][best_idx] * scale
- top = data["top"][best_idx] * scale
- h = data["height"][best_idx] * scale
- cy = top + h / 2.0
- points.append((lx, cy))
-
- # 5. Find dominant left-edge column + compute angle
- xs = np.array([p[0] for p in points])
- ys = np.array([p[1] for p in points])
- median_x = float(np.median(xs))
- tolerance = orig_w * 0.03 # 3% of image width
-
- mask = np.abs(xs - median_x) <= tolerance
- filtered_xs = xs[mask]
- filtered_ys = ys[mask]
-
- if len(filtered_xs) < 5:
- logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
- return image_data, 0.0
-
- # polyfit: x = a*y + b → a = dx/dy → angle = arctan(a)
- coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
- slope = coeffs[0] # dx/dy
- angle_rad = np.arctan(slope)
- angle_deg = float(np.degrees(angle_rad))
-
- # Clamp to ±5°
- angle_deg = max(-5.0, min(5.0, angle_deg))
-
- logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}° from {len(filtered_xs)} points "
- f"(total lines: {len(line_groups)})")
-
- if abs(angle_deg) < 0.05:
- return image_data, 0.0
-
- # 6. Rotate full-res image
- center = (orig_w // 2, orig_h // 2)
- M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
- rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE)
-
- # Encode back to PNG
- success, png_buf = cv2.imencode(".png", rotated)
- if not success:
- logger.warning("deskew_by_word_alignment: PNG encoding failed")
- return image_data, 0.0
-
- return png_buf.tobytes(), angle_deg
-
-
-def _projection_gradient_score(profile: np.ndarray) -> float:
- """Score a projection profile by the L2-norm of its first derivative.
-
- Higher score = sharper transitions between text-lines and gaps,
- i.e. better row/column alignment.
- """
- diff = np.diff(profile)
- return float(np.sum(diff * diff))
-
-
-def deskew_image_iterative(
- img: np.ndarray,
- coarse_range: float = 5.0,
- coarse_step: float = 0.1,
- fine_range: float = 0.15,
- fine_step: float = 0.02,
-) -> Tuple[np.ndarray, float, Dict[str, Any]]:
- """Iterative deskew using vertical-edge projection optimisation.
-
- The key insight: at the correct rotation angle, vertical features
- (word left-edges, column borders) become truly vertical, producing
- the sharpest peaks in the vertical projection of vertical edges.
-
- Method:
- 1. Detect vertical edges via Sobel-X on the central crop.
- 2. Coarse sweep: rotate edge image, compute vertical projection
- gradient score. The angle where vertical edges align best wins.
- 3. Fine sweep: refine around the coarse winner.
-
- Args:
- img: BGR image (full resolution).
- coarse_range: half-range in degrees for the coarse sweep.
- coarse_step: step size in degrees for the coarse sweep.
- fine_range: half-range around the coarse winner for the fine sweep.
- fine_step: step size in degrees for the fine sweep.
-
- Returns:
- (rotated_bgr, angle_degrees, debug_dict)
- """
- h, w = img.shape[:2]
- debug: Dict[str, Any] = {}
-
- # --- Grayscale + vertical edge detection ---
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- # Central crop (15%-85% height, 10%-90% width) to avoid page margins
- y_lo, y_hi = int(h * 0.15), int(h * 0.85)
- x_lo, x_hi = int(w * 0.10), int(w * 0.90)
- gray_crop = gray[y_lo:y_hi, x_lo:x_hi]
-
- # Sobel-X → absolute vertical edges
- sobel_x = cv2.Sobel(gray_crop, cv2.CV_64F, 1, 0, ksize=3)
- edges = np.abs(sobel_x)
- # Normalise to 0-255 for consistent scoring
- edge_max = edges.max()
- if edge_max > 0:
- edges = (edges / edge_max * 255).astype(np.uint8)
- else:
- return img, 0.0, {"error": "no edges detected"}
-
- crop_h, crop_w = edges.shape[:2]
- crop_center = (crop_w // 2, crop_h // 2)
-
- # Trim margin after rotation to avoid border artifacts
- trim_y = max(4, int(crop_h * 0.03))
- trim_x = max(4, int(crop_w * 0.03))
-
- def _sweep_edges(angles: np.ndarray) -> list:
- """Score each angle by vertical projection gradient of vertical edges."""
- results = []
- for angle in angles:
- if abs(angle) < 1e-6:
- rotated = edges
- else:
- M = cv2.getRotationMatrix2D(crop_center, angle, 1.0)
- rotated = cv2.warpAffine(edges, M, (crop_w, crop_h),
- flags=cv2.INTER_NEAREST,
- borderMode=cv2.BORDER_REPLICATE)
- # Trim borders to avoid edge artifacts
- trimmed = rotated[trim_y:-trim_y, trim_x:-trim_x]
- v_profile = np.sum(trimmed, axis=0, dtype=np.float64)
- score = _projection_gradient_score(v_profile)
- results.append((float(angle), score))
- return results
-
- # --- Phase 1: coarse sweep ---
- coarse_angles = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step)
- coarse_results = _sweep_edges(coarse_angles)
- best_coarse = max(coarse_results, key=lambda x: x[1])
- best_coarse_angle, best_coarse_score = best_coarse
-
- debug["coarse_best_angle"] = round(best_coarse_angle, 2)
- debug["coarse_best_score"] = round(best_coarse_score, 1)
- debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results]
-
- # --- Phase 2: fine sweep around coarse winner ---
- fine_lo = best_coarse_angle - fine_range
- fine_hi = best_coarse_angle + fine_range
- fine_angles = np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step)
- fine_results = _sweep_edges(fine_angles)
- best_fine = max(fine_results, key=lambda x: x[1])
- best_fine_angle, best_fine_score = best_fine
-
- debug["fine_best_angle"] = round(best_fine_angle, 2)
- debug["fine_best_score"] = round(best_fine_score, 1)
- debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results]
-
- final_angle = best_fine_angle
-
- # Clamp to ±5°
- final_angle = max(-5.0, min(5.0, final_angle))
-
- logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}° fine={best_fine_angle:.2f}° -> {final_angle:.2f}°")
-
- if abs(final_angle) < 0.05:
- return img, 0.0, debug
-
- # --- Rotate full-res image ---
- center = (w // 2, h // 2)
- M = cv2.getRotationMatrix2D(center, final_angle, 1.0)
- rotated = cv2.warpAffine(img, M, (w, h),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE)
-
- return rotated, final_angle, debug
-
-
-def _measure_textline_slope(img: np.ndarray) -> float:
- """Measure residual text-line slope via Tesseract word-position regression.
-
- Groups Tesseract words by (block, par, line), fits a linear regression
- per line (y = slope * x + b), and returns the trimmed-mean slope in
- degrees. Positive = text rises to the right, negative = falls.
-
- This is the most direct measurement of remaining rotation after deskew.
- """
- import math as _math
-
- if not TESSERACT_AVAILABLE or not CV2_AVAILABLE:
- return 0.0
-
- h, w = img.shape[:2]
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
- data = pytesseract.image_to_data(
- Image.fromarray(gray),
- output_type=pytesseract.Output.DICT,
- config="--psm 6",
- )
-
- # Group word centres by text line
- lines: Dict[tuple, list] = {}
- for i in range(len(data["text"])):
- txt = (data["text"][i] or "").strip()
- if len(txt) < 2 or int(data["conf"][i]) < 30:
- continue
- key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
- cx = data["left"][i] + data["width"][i] / 2.0
- cy = data["top"][i] + data["height"][i] / 2.0
- lines.setdefault(key, []).append((cx, cy))
-
- # Per-line linear regression → slope angle
- slopes: list = []
- for pts in lines.values():
- if len(pts) < 3:
- continue
- pts.sort(key=lambda p: p[0])
- xs = np.array([p[0] for p in pts], dtype=np.float64)
- ys = np.array([p[1] for p in pts], dtype=np.float64)
- if xs[-1] - xs[0] < w * 0.15:
- continue # skip short lines
- A = np.vstack([xs, np.ones_like(xs)]).T
- result = np.linalg.lstsq(A, ys, rcond=None)
- slope = result[0][0]
- slopes.append(_math.degrees(_math.atan(slope)))
-
- if len(slopes) < 3:
- return 0.0
-
- # Trimmed mean (drop 10% extremes on each side)
- slopes.sort()
- trim = max(1, len(slopes) // 10)
- trimmed = slopes[trim:-trim] if len(slopes) > 2 * trim else slopes
- if not trimmed:
- return 0.0
-
- return sum(trimmed) / len(trimmed)
-
-
-def deskew_two_pass(
- img: np.ndarray,
- coarse_range: float = 5.0,
-) -> Tuple[np.ndarray, float, Dict[str, Any]]:
- """Two-pass deskew: iterative projection + word-alignment residual check.
-
- Pass 1: ``deskew_image_iterative()`` (vertical-edge projection, wide range).
- Pass 2: ``deskew_image_by_word_alignment()`` on the already-corrected image
- to detect and fix residual skew that the projection method missed.
-
- The two corrections are summed. If the residual from Pass 2 is below
- 0.3° it is ignored (already good enough).
-
- Returns:
- (corrected_bgr, total_angle_degrees, debug_dict)
- """
- debug: Dict[str, Any] = {}
-
- # --- Pass 1: iterative projection ---
- corrected, angle1, dbg1 = deskew_image_iterative(
- img.copy(), coarse_range=coarse_range,
- )
- debug["pass1_angle"] = round(angle1, 3)
- debug["pass1_method"] = "iterative"
- debug["pass1_debug"] = dbg1
-
- # --- Pass 2: word-alignment residual check on corrected image ---
- angle2 = 0.0
- try:
- # Encode the corrected image to PNG bytes for word-alignment
- ok, buf = cv2.imencode(".png", corrected)
- if ok:
- corrected_bytes, angle2 = deskew_image_by_word_alignment(buf.tobytes())
- if abs(angle2) >= 0.3:
- # Significant residual — decode and use the second correction
- arr2 = np.frombuffer(corrected_bytes, dtype=np.uint8)
- corrected2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR)
- if corrected2 is not None:
- corrected = corrected2
- logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° applied "
- f"(total={angle1 + angle2:.2f}°)")
- else:
- angle2 = 0.0
- else:
- logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° < 0.3° — skipped")
- angle2 = 0.0
- except Exception as e:
- logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}")
- angle2 = 0.0
-
- # --- Pass 3: Tesseract text-line regression residual check ---
- # The most reliable final check: measure actual text-line slopes
- # using Tesseract word positions and linear regression per line.
- angle3 = 0.0
- try:
- residual = _measure_textline_slope(corrected)
- debug["pass3_raw"] = round(residual, 3)
- if abs(residual) >= 0.3:
- h3, w3 = corrected.shape[:2]
- center3 = (w3 // 2, h3 // 2)
- M3 = cv2.getRotationMatrix2D(center3, residual, 1.0)
- corrected = cv2.warpAffine(
- corrected, M3, (w3, h3),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE,
- )
- angle3 = residual
- logger.info(
- "deskew_two_pass: pass3 text-line residual=%.2f° applied",
- residual,
- )
- else:
- logger.info(
- "deskew_two_pass: pass3 text-line residual=%.2f° < 0.3° — skipped",
- residual,
- )
- except Exception as e:
- logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e)
-
- total_angle = angle1 + angle2 + angle3
- debug["pass2_angle"] = round(angle2, 3)
- debug["pass2_method"] = "word_alignment"
- debug["pass3_angle"] = round(angle3, 3)
- debug["pass3_method"] = "textline_regression"
- debug["total_angle"] = round(total_angle, 3)
-
- logger.info(
- "deskew_two_pass: pass1=%.2f° + pass2=%.2f° + pass3=%.2f° = %.2f°",
- angle1, angle2, angle3, total_angle,
- )
-
- return corrected, total_angle, debug
-
-
-# =============================================================================
-# Stage 3: Dewarp (Book Curvature Correction)
-# =============================================================================
-
-def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
- """Detect the vertical shear angle of the page.
-
- After deskew (horizontal lines aligned), vertical features like column
- edges may still be tilted. This measures that tilt by tracking the
- strongest vertical edge across horizontal strips.
-
- The result is a shear angle in degrees: the angular difference between
- true vertical and the detected column edge.
-
- Returns:
- Dict with keys: method, shear_degrees, confidence.
- """
- h, w = img.shape[:2]
- result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}
-
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- # Vertical Sobel to find vertical edges
- sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
- abs_sobel = np.abs(sobel_x).astype(np.uint8)
-
- # Binarize with Otsu
- _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
-
- num_strips = 20
- strip_h = h // num_strips
- edge_positions = [] # (y_center, x_position)
-
- for i in range(num_strips):
- y_start = i * strip_h
- y_end = min((i + 1) * strip_h, h)
- strip = binary[y_start:y_end, :]
-
- # Project vertically (sum along y-axis)
- projection = np.sum(strip, axis=0).astype(np.float64)
- if projection.max() == 0:
- continue
-
- # Find the strongest vertical edge in left 40% of image
- search_w = int(w * 0.4)
- left_proj = projection[:search_w]
- if left_proj.max() == 0:
- continue
-
- # Smooth and find peak
- kernel_size = max(3, w // 100)
- if kernel_size % 2 == 0:
- kernel_size += 1
- smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
- x_pos = float(np.argmax(smoothed))
- y_center = (y_start + y_end) / 2.0
- edge_positions.append((y_center, x_pos))
-
- if len(edge_positions) < 8:
- return result
-
- ys = np.array([p[0] for p in edge_positions])
- xs = np.array([p[1] for p in edge_positions])
-
- # Remove outliers (> 2 std from median)
- median_x = np.median(xs)
- std_x = max(np.std(xs), 1.0)
- mask = np.abs(xs - median_x) < 2 * std_x
- ys = ys[mask]
- xs = xs[mask]
-
- if len(ys) < 6:
- return result
-
- # Fit straight line: x = slope * y + intercept
- # The slope tells us the tilt of the vertical edge
- straight_coeffs = np.polyfit(ys, xs, 1)
- slope = straight_coeffs[0] # dx/dy in pixels
- fitted = np.polyval(straight_coeffs, ys)
- residuals = xs - fitted
- rmse = float(np.sqrt(np.mean(residuals ** 2)))
-
- # Convert slope to angle: arctan(dx/dy) in degrees
- import math
- shear_degrees = math.degrees(math.atan(slope))
-
- confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)
-
- result["shear_degrees"] = round(shear_degrees, 3)
- result["confidence"] = round(float(confidence), 2)
-
- return result
-
-
-def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
- """Detect shear angle by maximising variance of horizontal text-line projections.
-
- Principle: horizontal text lines produce a row-projection profile with sharp
- peaks (high variance) when the image is correctly aligned. Any residual shear
- smears the peaks and reduces variance. We sweep ±3° and pick the angle whose
- corrected projection has the highest variance.
-
- Works best on pages with clear horizontal banding (vocabulary tables, prose).
- Complements _detect_shear_angle() which needs strong vertical edges.
-
- Returns:
- Dict with keys: method, shear_degrees, confidence.
- """
- import math
- result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}
-
- h, w = img.shape[:2]
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- # Otsu binarisation
- _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
-
- # Work at half resolution for speed
- small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA)
- sh, sw = small.shape
-
- # 2-pass angle sweep for 10x better precision:
- # Pass 1: Coarse sweep ±3° in 0.5° steps (13 values)
- # Pass 2: Fine sweep ±0.5° around coarse best in 0.05° steps (21 values)
-
- def _sweep_variance(angles_list):
- results = []
- for angle_deg in angles_list:
- if abs(angle_deg) < 0.001:
- rotated = small
- else:
- shear_tan = math.tan(math.radians(angle_deg))
- M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]])
- rotated = cv2.warpAffine(small, M, (sw, sh),
- flags=cv2.INTER_NEAREST,
- borderMode=cv2.BORDER_CONSTANT)
- profile = np.sum(rotated, axis=1).astype(float)
- results.append((angle_deg, float(np.var(profile))))
- return results
-
- # Pass 1: coarse
- coarse_angles = [a * 0.5 for a in range(-6, 7)] # 13 values
- coarse_results = _sweep_variance(coarse_angles)
- coarse_best = max(coarse_results, key=lambda x: x[1])
-
- # Pass 2: fine around coarse best
- fine_center = coarse_best[0]
- fine_angles = [fine_center + a * 0.05 for a in range(-10, 11)] # 21 values
- fine_results = _sweep_variance(fine_angles)
- fine_best = max(fine_results, key=lambda x: x[1])
-
- best_angle = fine_best[0]
- best_variance = fine_best[1]
- variances = coarse_results + fine_results
-
- # Confidence: how much sharper is the best angle vs. the mean?
- all_mean = sum(v for _, v in variances) / len(variances)
- if all_mean > 0 and best_variance > all_mean:
- confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6)
- else:
- confidence = 0.0
-
- result["shear_degrees"] = round(best_angle, 3)
- result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
- return result
-
-
-def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
- """Detect shear using Hough transform on printed table / ruled lines.
-
- Vocabulary worksheets have near-horizontal printed table borders. After
- deskew these should be exactly horizontal; any residual tilt equals the
- vertical shear angle (with inverted sign).
-
- The sign convention: a horizontal line tilting +α degrees (left end lower)
- means the page has vertical shear of -α degrees (left column edge drifts
- to the left going downward).
-
- Returns:
- Dict with keys: method, shear_degrees, confidence.
- """
- result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}
-
- h, w = img.shape[:2]
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- edges = cv2.Canny(gray, 50, 150, apertureSize=3)
-
- min_len = int(w * 0.15)
- lines = cv2.HoughLinesP(
- edges, rho=1, theta=np.pi / 360,
- threshold=int(w * 0.08),
- minLineLength=min_len,
- maxLineGap=20,
- )
-
- if lines is None or len(lines) < 3:
- return result
-
- horizontal_angles: List[Tuple[float, float]] = []
- for line in lines:
- x1, y1, x2, y2 = line[0]
- if x1 == x2:
- continue
- angle = float(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
- if abs(angle) <= 5.0:
- length = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2))
- horizontal_angles.append((angle, length))
-
- if len(horizontal_angles) < 3:
- return result
-
- # Weighted median
- angles_arr = np.array([a for a, _ in horizontal_angles])
- weights_arr = np.array([l for _, l in horizontal_angles])
- sorted_idx = np.argsort(angles_arr)
- s_angles = angles_arr[sorted_idx]
- s_weights = weights_arr[sorted_idx]
- cum = np.cumsum(s_weights)
- mid_idx = int(np.searchsorted(cum, cum[-1] / 2.0))
- median_angle = float(s_angles[min(mid_idx, len(s_angles) - 1)])
-
- agree = sum(1 for a, _ in horizontal_angles if abs(a - median_angle) < 1.0)
- confidence = min(1.0, agree / max(len(horizontal_angles), 1)) * 0.85
-
- # Sign inversion: horizontal line tilt is complementary to vertical shear
- shear_degrees = -median_angle
-
- result["shear_degrees"] = round(shear_degrees, 3)
- result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
- return result
-
-
-def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
- """Detect shear by measuring text-line straightness (Method D).
-
- Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word
- bounding boxes, groups them into vertical columns by X-proximity,
- and measures how the left-edge X position drifts with Y (vertical
- position). The drift dx/dy is the tangent of the shear angle.
-
- This directly measures vertical shear (column tilt) rather than
- horizontal text-line slope, which is already corrected by deskew.
-
- Returns:
- Dict with keys: method, shear_degrees, confidence.
- """
- import math
- result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0}
-
- h, w = img.shape[:2]
- # Downscale 50% for speed
- scale = 0.5
- small = cv2.resize(img, (int(w * scale), int(h * scale)),
- interpolation=cv2.INTER_AREA)
- gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
- pil_img = Image.fromarray(gray)
-
- try:
- data = pytesseract.image_to_data(
- pil_img, lang='eng+deu', config='--psm 11 --oem 3',
- output_type=pytesseract.Output.DICT,
- )
- except Exception:
- return result
-
- # Collect word left-edges (x) and vertical centres (y)
- words = []
- for i in range(len(data['text'])):
- text = data['text'][i].strip()
- conf = int(data['conf'][i])
- if not text or conf < 20 or len(text) < 2:
- continue
- left_x = float(data['left'][i])
- cy = data['top'][i] + data['height'][i] / 2.0
- word_w = float(data['width'][i])
- words.append((left_x, cy, word_w))
-
- if len(words) < 15:
- return result
-
- # --- Group words into vertical columns by left-edge X proximity ---
- # Sort by x, then cluster words whose left-edges are within x_tol
- avg_w = sum(ww for _, _, ww in words) / len(words)
- x_tol = max(avg_w * 0.4, 8) # tolerance for "same column"
-
- words_by_x = sorted(words, key=lambda w: w[0])
- columns: List[List[Tuple[float, float]]] = [] # each: [(left_x, cy), ...]
- cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
- cur_x = words_by_x[0][0]
-
- for lx, cy, _ in words_by_x[1:]:
- if abs(lx - cur_x) <= x_tol:
- cur_col.append((lx, cy))
- # Update running x as median of cluster
- cur_x = cur_x * 0.8 + lx * 0.2
- else:
- if len(cur_col) >= 5:
- columns.append(cur_col)
- cur_col = [(lx, cy)]
- cur_x = lx
- if len(cur_col) >= 5:
- columns.append(cur_col)
-
- if len(columns) < 2:
- return result
-
- # --- For each column, measure X-drift as a function of Y ---
- # Fit: left_x = a * cy + b → a = dx/dy = tan(shear_angle)
- drifts = []
- for col in columns:
- ys = np.array([p[1] for p in col])
- xs = np.array([p[0] for p in col])
- y_range = ys.max() - ys.min()
- if y_range < h * scale * 0.3:
- continue # column must span at least 30% of image height
- # Linear regression: x = a*y + b
- coeffs = np.polyfit(ys, xs, 1)
- drifts.append(coeffs[0]) # dx/dy
-
- if len(drifts) < 2:
- return result
-
- # Median dx/dy → shear angle
- # dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right
- median_drift = float(np.median(drifts))
- shear_degrees = math.degrees(math.atan(median_drift))
-
- # Confidence from column count + drift consistency
- drift_std = float(np.std(drifts))
- consistency = max(0.0, 1.0 - drift_std * 50) # tighter penalty for drift variance
- count_factor = min(1.0, len(drifts) / 4.0)
- confidence = count_factor * 0.5 + consistency * 0.5
-
- result["shear_degrees"] = round(shear_degrees, 3)
- result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
- logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
- "shear=%.3f°, conf=%.2f",
- len(columns), len(drifts), median_drift,
- shear_degrees, confidence)
- return result
-
-
-def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool:
- """Check whether the dewarp correction actually improved alignment.
-
- Compares horizontal projection variance before and after correction.
- Higher variance means sharper text-line peaks, which indicates better
- horizontal alignment.
-
- Returns True if the correction improved the image, False if it should
- be discarded.
- """
- def _h_proj_variance(img: np.ndarray) -> float:
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
- _, binary = cv2.threshold(gray, 0, 255,
- cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
- small = cv2.resize(binary, (binary.shape[1] // 2, binary.shape[0] // 2),
- interpolation=cv2.INTER_AREA)
- profile = np.sum(small, axis=1).astype(float)
- return float(np.var(profile))
-
- var_before = _h_proj_variance(original)
- var_after = _h_proj_variance(corrected)
-
- # Correction must improve variance (even by a tiny margin)
- return var_after > var_before
-
-
-def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
- """Apply a vertical shear correction to an image.
-
- Shifts each row horizontally proportional to its distance from the
- vertical center. This corrects the tilt of vertical features (columns)
- without affecting horizontal alignment (text lines).
-
- Args:
- img: BGR image.
- shear_degrees: Shear angle in degrees. Positive = shift top-right/bottom-left.
-
- Returns:
- Corrected image.
- """
- import math
- h, w = img.shape[:2]
- shear_tan = math.tan(math.radians(shear_degrees))
-
- # Affine matrix: shift x by shear_tan * (y - h/2)
- # [1 shear_tan -h/2*shear_tan]
- # [0 1 0 ]
- M = np.float32([
- [1, shear_tan, -h / 2.0 * shear_tan],
- [0, 1, 0],
- ])
-
- corrected = cv2.warpAffine(img, M, (w, h),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE)
- return corrected
-
-
-def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
- """Combine multiple shear detections into a single weighted estimate (v2).
-
- Ensemble v2 changes vs v1:
- - Minimum confidence raised to 0.5 (was 0.3)
- - text_lines method gets 1.5× weight boost (most reliable detector)
- - Outlier filter at 1° from weighted mean
-
- Returns:
- (shear_degrees, ensemble_confidence, methods_used_str)
- """
- # Confidence threshold — lowered from 0.5 to 0.35 to catch subtle shear
- # that individual methods detect with moderate confidence.
- _MIN_CONF = 0.35
-
- # text_lines gets a weight boost as the most content-aware method
- _METHOD_WEIGHT_BOOST = {"text_lines": 1.5}
-
- accepted = []
- for d in detections:
- if d["confidence"] < _MIN_CONF:
- continue
- boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0)
- effective_conf = d["confidence"] * boost
- accepted.append((d["shear_degrees"], effective_conf, d["method"]))
-
- if not accepted:
- return 0.0, 0.0, "none"
-
- if len(accepted) == 1:
- deg, conf, method = accepted[0]
- return deg, min(conf, 1.0), method
-
- # First pass: weighted mean
- total_w = sum(c for _, c, _ in accepted)
- w_mean = sum(d * c for d, c, _ in accepted) / total_w
-
- # Outlier filter: keep results within 1° of weighted mean
- filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
- if not filtered:
- filtered = accepted # fallback: keep all
-
- # Second pass: weighted mean on filtered results
- total_w2 = sum(c for _, c, _ in filtered)
- final_deg = sum(d * c for d, c, _ in filtered) / total_w2
-
- # Ensemble confidence: average of individual confidences, boosted when
- # methods agree (all within 0.5° of each other)
- avg_conf = total_w2 / len(filtered)
- spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
- agreement_bonus = 0.15 if spread < 0.5 else 0.0
- ensemble_conf = min(1.0, avg_conf + agreement_bonus)
-
- methods_str = "+".join(m for _, _, m in filtered)
- return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str
-
-
-def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
- """Correct vertical shear after deskew (v2 with quality gate).
-
- After deskew aligns horizontal text lines, vertical features (column
- edges) may still be tilted. This detects the tilt angle using an ensemble
- of four complementary methods and applies an affine shear correction.
-
- Methods (all run in ~150ms total):
- A. _detect_shear_angle() — vertical edge profile (~50ms)
- B. _detect_shear_by_projection() — horizontal text-line variance (~30ms)
- C. _detect_shear_by_hough() — Hough lines on table borders (~20ms)
- D. _detect_shear_by_text_lines() — text-line straightness (~50ms)
-
- Quality gate: after correction, horizontal projection variance is compared
- before vs after. If correction worsened alignment, it is discarded.
-
- Args:
- img: BGR image (already deskewed).
- use_ensemble: If False, fall back to single-method behaviour (method A only).
-
- Returns:
- Tuple of (corrected_image, dewarp_info).
- dewarp_info keys: method, shear_degrees, confidence, detections.
- """
- no_correction = {
- "method": "none",
- "shear_degrees": 0.0,
- "confidence": 0.0,
- "detections": [],
- }
-
- if not CV2_AVAILABLE:
- return img, no_correction
-
- t0 = time.time()
-
- if use_ensemble:
- det_a = _detect_shear_angle(img)
- det_b = _detect_shear_by_projection(img)
- det_c = _detect_shear_by_hough(img)
- det_d = _detect_shear_by_text_lines(img)
- detections = [det_a, det_b, det_c, det_d]
- shear_deg, confidence, method = _ensemble_shear(detections)
- else:
- det_a = _detect_shear_angle(img)
- detections = [det_a]
- shear_deg = det_a["shear_degrees"]
- confidence = det_a["confidence"]
- method = det_a["method"]
-
- duration = time.time() - t0
-
- logger.info(
- "dewarp: ensemble shear=%.3f° conf=%.2f method=%s (%.2fs) | "
- "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f",
- shear_deg, confidence, method, duration,
- detections[0]["shear_degrees"], detections[0]["confidence"],
- detections[1]["shear_degrees"] if len(detections) > 1 else 0.0,
- detections[1]["confidence"] if len(detections) > 1 else 0.0,
- detections[2]["shear_degrees"] if len(detections) > 2 else 0.0,
- detections[2]["confidence"] if len(detections) > 2 else 0.0,
- detections[3]["shear_degrees"] if len(detections) > 3 else 0.0,
- detections[3]["confidence"] if len(detections) > 3 else 0.0,
- )
-
- # Always include individual detections (even when no correction applied)
- _all_detections = [
- {"method": d["method"], "shear_degrees": d["shear_degrees"],
- "confidence": d["confidence"]}
- for d in detections
- ]
-
- # Thresholds: very small shear (<0.08°) is truly irrelevant for OCR.
- # For ensemble confidence, require at least 0.4 (lowered from 0.5 to
- # catch moderate-confidence detections from multiple agreeing methods).
- if abs(shear_deg) < 0.08 or confidence < 0.4:
- no_correction["detections"] = _all_detections
- return img, no_correction
-
- # Apply correction (negate the detected shear to straighten)
- corrected = _apply_shear(img, -shear_deg)
-
- # Quality gate: verify the correction actually improved alignment.
- # For small corrections (< 0.5°), the projection variance change can be
- # negligible, so we skip the quality gate — the cost of a tiny wrong
- # correction is much less than the cost of leaving 0.4° uncorrected
- # (which shifts content ~25px at image edges on tall scans).
- if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected):
- logger.info("dewarp: quality gate REJECTED correction (%.3f°) — "
- "projection variance did not improve", shear_deg)
- no_correction["detections"] = _all_detections
- return img, no_correction
-
- info = {
- "method": method,
- "shear_degrees": shear_deg,
- "confidence": confidence,
- "detections": _all_detections,
- }
-
- return corrected, info
-
-
-def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray:
- """Apply shear correction with a manual angle.
-
- Args:
- img: BGR image (deskewed, before dewarp).
- shear_degrees: Shear angle in degrees to correct.
-
- Returns:
- Corrected image.
- """
- if abs(shear_degrees) < 0.001:
- return img
- return _apply_shear(img, -shear_degrees)
-
diff --git a/klausur-service/backend/cv_preprocessing_deskew.py b/klausur-service/backend/cv_preprocessing_deskew.py
new file mode 100644
index 0000000..1bdb27e
--- /dev/null
+++ b/klausur-service/backend/cv_preprocessing_deskew.py
@@ -0,0 +1,437 @@
+"""
+CV Preprocessing Deskew — Rotation correction via Hough lines, word alignment, and iterative projection.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from collections import defaultdict
+from typing import Any, Dict, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+ CV2_AVAILABLE,
+ TESSERACT_AVAILABLE,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+ import cv2
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+
+try:
+ import pytesseract
+ from PIL import Image
+except ImportError:
+ pytesseract = None # type: ignore[assignment]
+ Image = None # type: ignore[assignment,misc]
+
+
+# =============================================================================
+# Deskew via Hough Lines
+# =============================================================================
+
def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
    """Correct rotation using Hough Line detection.

    Detects near-horizontal line segments, takes the median of their
    angles, and rotates the image back by that angle.

    Args:
        img: BGR image.

    Returns:
        Tuple of (corrected image, detected angle in degrees).  The input
        is returned unchanged with angle 0.0 when OpenCV is unavailable,
        too few lines are found, or the detected angle is negligible.
    """
    # Guard against a missing OpenCV install, consistent with the other
    # deskew helpers in this module (without it, cv2 is None here and the
    # first call below raises AttributeError).
    if not CV2_AVAILABLE:
        return img, 0.0

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    lines = cv2.HoughLinesP(binary, 1, np.pi / 180, threshold=100,
                            minLineLength=img.shape[1] // 4, maxLineGap=20)

    # Need a minimum number of segments for a stable median estimate.
    if lines is None or len(lines) < 3:
        return img, 0.0

    angles = []
    for line in lines:
        x1, y1, x2, y2 = line[0]
        angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
        # Keep only near-horizontal segments (text lines / ruled lines).
        if abs(angle) < 15:
            angles.append(angle)

    if not angles:
        return img, 0.0

    median_angle = float(np.median(angles))

    # Clamp to +/-5 deg: larger corrections are implausible for scans.
    if abs(median_angle) > 5.0:
        median_angle = 5.0 * np.sign(median_angle)

    # Sub-0.1 deg is measurement noise; skip the warp entirely.
    if abs(median_angle) < 0.1:
        return img, 0.0

    h, w = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
    corrected = cv2.warpAffine(img, M, (w, h),
                               flags=cv2.INTER_LINEAR,
                               borderMode=cv2.BORDER_REPLICATE)

    # Lazy %-formatting keeps the call cheap when INFO is disabled,
    # matching the logging style used elsewhere in this module.
    logger.info("Deskew: corrected %.2f\u00b0 rotation", median_angle)
    return corrected, median_angle
+
+
+# =============================================================================
+# Deskew via Word Alignment
+# =============================================================================
+
def deskew_image_by_word_alignment(
    image_data: bytes,
    lang: str = "eng+deu",
    downscale_factor: float = 0.5,
) -> Tuple[bytes, float]:
    """Correct rotation by fitting a line through left-most word starts per text line.

    More robust than Hough-based deskew for vocabulary worksheets where text lines
    have consistent left-alignment.

    Pipeline: decode -> downscale -> quick Tesseract word pass -> group
    words into text lines -> take the left-most word per line -> keep
    lines whose left edge sits near the page-wide median -> fit x = f(y)
    through those points -> rotate by the fitted slope.

    Args:
        image_data: Raw image bytes (PNG/JPEG).
        lang: Tesseract language string for the quick pass.
        downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).

    Returns:
        Tuple of (rotated image as PNG bytes, detected angle in degrees).
        Returns the input bytes with angle 0.0 on decode/OCR failure, when
        too few aligned lines are found, or when the angle is negligible.
    """
    # Needs both OpenCV (decode/rotate) and Tesseract (word boxes).
    if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
        return image_data, 0.0

    img_array = np.frombuffer(image_data, dtype=np.uint8)
    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    if img is None:
        logger.warning("deskew_by_word_alignment: could not decode image")
        return image_data, 0.0

    orig_h, orig_w = img.shape[:2]

    # Run the quick Tesseract pass on a downscaled copy for speed; word
    # coordinates are scaled back to full resolution afterwards.
    small_w = int(orig_w * downscale_factor)
    small_h = int(orig_h * downscale_factor)
    small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)

    pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
    try:
        data = pytesseract.image_to_data(
            pil_small, lang=lang, config="--psm 6 --oem 3",
            output_type=pytesseract.Output.DICT,
        )
    except Exception as e:
        logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
        return image_data, 0.0

    # Group confident words by Tesseract's (block, paragraph, line) id.
    line_groups: Dict[tuple, list] = defaultdict(list)
    for i in range(len(data["text"])):
        text = (data["text"][i] or "").strip()
        conf = int(data["conf"][i])
        if not text or conf < 20:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        line_groups[key].append(i)

    # Too few text lines: a slope fit would be unreliable.
    if len(line_groups) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
        return image_data, 0.0

    # For each line, take the left-most word and record its left edge and
    # vertical centre in original-resolution coordinates.
    scale = 1.0 / downscale_factor
    points = []
    for key, indices in line_groups.items():
        best_idx = min(indices, key=lambda i: data["left"][i])
        lx = data["left"][best_idx] * scale
        top = data["top"][best_idx] * scale
        h = data["height"][best_idx] * scale
        cy = top + h / 2.0
        points.append((lx, cy))

    # Keep only lines whose left edge lies within 3% of the page width
    # of the median left edge -- i.e. the consistently left-aligned lines
    # this method relies on; indented lines are rejected as outliers.
    xs = np.array([p[0] for p in points])
    ys = np.array([p[1] for p in points])
    median_x = float(np.median(xs))
    tolerance = orig_w * 0.03

    mask = np.abs(xs - median_x) <= tolerance
    filtered_xs = xs[mask]
    filtered_ys = ys[mask]

    if len(filtered_xs) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
        return image_data, 0.0

    # Fit x = slope * y + c: for a left-aligned column, the slope of the
    # left edge as a function of y is the page rotation.
    coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
    slope = coeffs[0]
    angle_rad = np.arctan(slope)
    angle_deg = float(np.degrees(angle_rad))

    # Clamp to +/-5 deg: larger values are implausible for scans.
    angle_deg = max(-5.0, min(5.0, angle_deg))

    logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}\u00b0 from {len(filtered_xs)} points "
                f"(total lines: {len(line_groups)})")

    # Sub-0.05 deg is noise; skip the warp and re-encode entirely.
    if abs(angle_deg) < 0.05:
        return image_data, 0.0

    center = (orig_w // 2, orig_h // 2)
    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)

    success, png_buf = cv2.imencode(".png", rotated)
    if not success:
        logger.warning("deskew_by_word_alignment: PNG encoding failed")
        return image_data, 0.0

    return png_buf.tobytes(), angle_deg
+
+
+# =============================================================================
+# Projection Gradient Scoring
+# =============================================================================
+
+def _projection_gradient_score(profile: np.ndarray) -> float:
+ """Score a projection profile by the L2-norm of its first derivative."""
+ diff = np.diff(profile)
+ return float(np.sum(diff * diff))
+
+
+# =============================================================================
+# Iterative Deskew (Vertical-Edge Projection)
+# =============================================================================
+
def deskew_image_iterative(
    img: np.ndarray,
    coarse_range: float = 5.0,
    coarse_step: float = 0.1,
    fine_range: float = 0.15,
    fine_step: float = 0.02,
) -> Tuple[np.ndarray, float, Dict[str, Any]]:
    """Iterative deskew using vertical-edge projection optimisation.

    Sweeps candidate rotation angles over a Sobel vertical-edge map and
    picks the angle maximising the gradient energy of the vertical
    projection profile (sharpest column alignment): first a coarse grid
    over the full range, then a fine grid around the coarse winner.

    Args:
        img: BGR image (full resolution).
        coarse_range: half-range in degrees for the coarse sweep.
        coarse_step: step size in degrees for the coarse sweep.
        fine_range: half-range around the coarse winner for the fine sweep.
        fine_step: step size in degrees for the fine sweep.

    Returns:
        (rotated_bgr, angle_degrees, debug_dict)
    """
    h, w = img.shape[:2]
    debug: Dict[str, Any] = {}

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Central crop (70% of height, 80% of width) so page borders and
    # punch-hole artefacts do not dominate the edge map.
    y_lo, y_hi = int(h * 0.15), int(h * 0.85)
    x_lo, x_hi = int(w * 0.10), int(w * 0.90)
    gray_crop = gray[y_lo:y_hi, x_lo:x_hi]

    # Vertical edges (x-gradient), normalised to uint8 for cheap rotation.
    sobel_x = cv2.Sobel(gray_crop, cv2.CV_64F, 1, 0, ksize=3)
    edges = np.abs(sobel_x)
    edge_max = edges.max()
    if edge_max > 0:
        edges = (edges / edge_max * 255).astype(np.uint8)
    else:
        # Completely flat crop: nothing to align on.
        return img, 0.0, {"error": "no edges detected"}

    crop_h, crop_w = edges.shape[:2]
    crop_center = (crop_w // 2, crop_h // 2)

    # Trim the rotated border so BORDER_REPLICATE artefacts cannot bias
    # the projection score.
    trim_y = max(4, int(crop_h * 0.03))
    trim_x = max(4, int(crop_w * 0.03))

    def _sweep_edges(angles: np.ndarray) -> list:
        # Score each candidate angle by the gradient energy of the
        # vertical projection of the rotated edge map.
        results = []
        for angle in angles:
            if abs(angle) < 1e-6:
                # Identity rotation: reuse the edge map as-is.
                rotated = edges
            else:
                M = cv2.getRotationMatrix2D(crop_center, angle, 1.0)
                rotated = cv2.warpAffine(edges, M, (crop_w, crop_h),
                                         flags=cv2.INTER_NEAREST,
                                         borderMode=cv2.BORDER_REPLICATE)
            trimmed = rotated[trim_y:-trim_y, trim_x:-trim_x]
            v_profile = np.sum(trimmed, axis=0, dtype=np.float64)
            score = _projection_gradient_score(v_profile)
            results.append((float(angle), score))
        return results

    # Coarse sweep over the full range (the half step in the upper bound
    # makes np.arange include the range endpoint).
    coarse_angles = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step)
    coarse_results = _sweep_edges(coarse_angles)
    best_coarse = max(coarse_results, key=lambda x: x[1])
    best_coarse_angle, best_coarse_score = best_coarse

    debug["coarse_best_angle"] = round(best_coarse_angle, 2)
    debug["coarse_best_score"] = round(best_coarse_score, 1)
    debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results]

    # Fine sweep in a narrow window around the coarse winner.
    fine_lo = best_coarse_angle - fine_range
    fine_hi = best_coarse_angle + fine_range
    fine_angles = np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step)
    fine_results = _sweep_edges(fine_angles)
    best_fine = max(fine_results, key=lambda x: x[1])
    best_fine_angle, best_fine_score = best_fine

    debug["fine_best_angle"] = round(best_fine_angle, 2)
    debug["fine_best_score"] = round(best_fine_score, 1)
    debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results]

    # Clamp to +/-5 deg as a sanity bound, like the other deskew methods.
    final_angle = best_fine_angle
    final_angle = max(-5.0, min(5.0, final_angle))

    logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}\u00b0 fine={best_fine_angle:.2f}\u00b0 -> {final_angle:.2f}\u00b0")

    # Sub-0.05 deg is noise; return the original image untouched.
    if abs(final_angle) < 0.05:
        return img, 0.0, debug

    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, final_angle, 1.0)
    rotated = cv2.warpAffine(img, M, (w, h),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)

    return rotated, final_angle, debug
+
+
+# =============================================================================
+# Text-Line Slope Measurement
+# =============================================================================
+
def _measure_textline_slope(img: np.ndarray) -> float:
    """Measure residual text-line slope via Tesseract word-position regression.

    Groups confident words into text lines, least-squares fits a line
    through each sufficiently wide text line's word centres, and returns
    the trimmed mean of the per-line slopes in degrees.  Returns 0.0 when
    the OCR stack is unavailable or too little usable text is found.
    """
    import math as _math

    if not TESSERACT_AVAILABLE or not CV2_AVAILABLE:
        return 0.0

    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    data = pytesseract.image_to_data(
        Image.fromarray(gray),
        output_type=pytesseract.Output.DICT,
        config="--psm 6",
    )

    # Collect word centres, grouped by Tesseract's (block, paragraph, line).
    lines: Dict[tuple, list] = {}
    for i in range(len(data["text"])):
        txt = (data["text"][i] or "").strip()
        # Skip single-character fragments and low-confidence words (noise).
        if len(txt) < 2 or int(data["conf"][i]) < 30:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        cx = data["left"][i] + data["width"][i] / 2.0
        cy = data["top"][i] + data["height"][i] / 2.0
        lines.setdefault(key, []).append((cx, cy))

    slopes: list = []
    for pts in lines.values():
        # At least 3 words per line for a meaningful regression.
        if len(pts) < 3:
            continue
        pts.sort(key=lambda p: p[0])
        xs = np.array([p[0] for p in pts], dtype=np.float64)
        ys = np.array([p[1] for p in pts], dtype=np.float64)
        # Ignore short lines (< 15% of page width): their slope is noisy.
        if xs[-1] - xs[0] < w * 0.15:
            continue
        # Least-squares fit y = slope * x + c through the word centres.
        A = np.vstack([xs, np.ones_like(xs)]).T
        result = np.linalg.lstsq(A, ys, rcond=None)
        slope = result[0][0]
        slopes.append(_math.degrees(_math.atan(slope)))

    if len(slopes) < 3:
        return 0.0

    # Trimmed mean: drop ~10% from each end to suppress outlier lines.
    slopes.sort()
    trim = max(1, len(slopes) // 10)
    trimmed = slopes[trim:-trim] if len(slopes) > 2 * trim else slopes
    if not trimmed:
        return 0.0

    return sum(trimmed) / len(trimmed)
+
+
+# =============================================================================
+# Two-Pass Deskew
+# =============================================================================
+
def deskew_two_pass(
    img: np.ndarray,
    coarse_range: float = 5.0,
) -> Tuple[np.ndarray, float, Dict[str, Any]]:
    """Two-pass deskew: iterative projection + word-alignment residual check.

    Despite the name, three passes run in sequence:

      1. deskew_image_iterative() -- projection-based coarse+fine sweep.
      2. deskew_image_by_word_alignment() -- residual from left-aligned
         word starts; applied only when it is at least 0.3 deg.
      3. _measure_textline_slope() -- residual from text-line regression;
         applied only when it is at least 0.3 deg.

    Passes 2 and 3 are best-effort: any exception is logged and treated
    as a zero residual, so this function never fails outright.

    Args:
        img: BGR image (full resolution).
        coarse_range: half-range in degrees forwarded to pass 1.

    Returns:
        (corrected_bgr, total_angle_degrees, debug_dict)
    """
    debug: Dict[str, Any] = {}

    # --- Pass 1: iterative projection ---
    corrected, angle1, dbg1 = deskew_image_iterative(
        img.copy(), coarse_range=coarse_range,
    )
    debug["pass1_angle"] = round(angle1, 3)
    debug["pass1_method"] = "iterative"
    debug["pass1_debug"] = dbg1

    # --- Pass 2: word-alignment residual check ---
    # Works on encoded PNG bytes, hence the round-trip through imencode/
    # imdecode around the byte-level helper.
    angle2 = 0.0
    try:
        ok, buf = cv2.imencode(".png", corrected)
        if ok:
            corrected_bytes, angle2 = deskew_image_by_word_alignment(buf.tobytes())
            if abs(angle2) >= 0.3:
                arr2 = np.frombuffer(corrected_bytes, dtype=np.uint8)
                corrected2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR)
                if corrected2 is not None:
                    corrected = corrected2
                    logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}\u00b0 applied "
                                f"(total={angle1 + angle2:.2f}\u00b0)")
                else:
                    # Decode of the corrected bytes failed: keep pass-1 result.
                    angle2 = 0.0
            else:
                logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}\u00b0 < 0.3\u00b0 -- skipped")
                angle2 = 0.0
    except Exception as e:
        logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}")
        angle2 = 0.0

    # --- Pass 3: Tesseract text-line regression residual check ---
    angle3 = 0.0
    try:
        residual = _measure_textline_slope(corrected)
        debug["pass3_raw"] = round(residual, 3)
        if abs(residual) >= 0.3:
            h3, w3 = corrected.shape[:2]
            center3 = (w3 // 2, h3 // 2)
            M3 = cv2.getRotationMatrix2D(center3, residual, 1.0)
            corrected = cv2.warpAffine(
                corrected, M3, (w3, h3),
                flags=cv2.INTER_LINEAR,
                borderMode=cv2.BORDER_REPLICATE,
            )
            angle3 = residual
            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 applied", residual)
        else:
            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 < 0.3\u00b0 -- skipped", residual)
    except Exception as e:
        logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e)

    # Report the sum of all applied rotations plus per-pass breakdown.
    total_angle = angle1 + angle2 + angle3
    debug["pass2_angle"] = round(angle2, 3)
    debug["pass2_method"] = "word_alignment"
    debug["pass3_angle"] = round(angle3, 3)
    debug["pass3_method"] = "textline_regression"
    debug["total_angle"] = round(total_angle, 3)

    logger.info(
        "deskew_two_pass: pass1=%.2f\u00b0 + pass2=%.2f\u00b0 + pass3=%.2f\u00b0 = %.2f\u00b0",
        angle1, angle2, angle3, total_angle,
    )

    return corrected, total_angle, debug
diff --git a/klausur-service/backend/cv_preprocessing_dewarp.py b/klausur-service/backend/cv_preprocessing_dewarp.py
new file mode 100644
index 0000000..640c87c
--- /dev/null
+++ b/klausur-service/backend/cv_preprocessing_dewarp.py
@@ -0,0 +1,474 @@
+"""
+CV Preprocessing Dewarp — Vertical shear detection and correction.
+
+Provides four shear detection methods (vertical edge, projection variance,
+Hough lines, text-line drift), ensemble combination, quality gating,
+and the main dewarp_image() function.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import math
+import time
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+ CV2_AVAILABLE,
+ TESSERACT_AVAILABLE,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+ import cv2
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+
+try:
+ import pytesseract
+ from PIL import Image
+except ImportError:
+ pytesseract = None # type: ignore[assignment]
+ Image = None # type: ignore[assignment,misc]
+
+
+# =============================================================================
+# Shear Detection Methods
+# =============================================================================
+
def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
    """Detect vertical shear angle via strongest vertical edge tracking (Method A).

    Splits the page into 20 horizontal strips, locates the strongest
    vertical edge in the left 40% of each strip (document border / table
    edge), and fits a line through the edge positions: the slope of that
    line is the shear angle.

    Args:
        img: BGR image (already deskewed).

    Returns:
        Dict with keys "method", "shear_degrees", "confidence"; the latter
        two stay 0.0 when too few strips yield a usable edge.
    """
    h, w = img.shape[:2]
    result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
    # BUGFIX: clip before the uint8 cast.  A raw astype(np.uint8) wraps
    # gradient magnitudes > 255 modulo 256, turning the strongest edges
    # into weak values and corrupting the Otsu threshold below.
    abs_sobel = np.clip(np.abs(sobel_x), 0, 255).astype(np.uint8)

    _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    num_strips = 20
    strip_h = h // num_strips
    edge_positions = []

    for i in range(num_strips):
        y_start = i * strip_h
        y_end = min((i + 1) * strip_h, h)
        strip = binary[y_start:y_end, :]

        projection = np.sum(strip, axis=0).astype(np.float64)
        if projection.max() == 0:
            continue

        # Only search the left 40% of the page: that is where the page
        # border / table edge lives; content further right is text.
        search_w = int(w * 0.4)
        left_proj = projection[:search_w]
        if left_proj.max() == 0:
            continue

        # Smooth the projection so argmax picks the edge centre, not noise.
        kernel_size = max(3, w // 100)
        if kernel_size % 2 == 0:
            kernel_size += 1
        smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
        x_pos = float(np.argmax(smoothed))
        y_center = (y_start + y_end) / 2.0
        edge_positions.append((y_center, x_pos))

    # Need enough strips with a detectable edge for a stable line fit.
    if len(edge_positions) < 8:
        return result

    ys = np.array([p[0] for p in edge_positions])
    xs = np.array([p[1] for p in edge_positions])

    # Reject outlier strips (e.g. where text was mistaken for the border).
    median_x = np.median(xs)
    std_x = max(np.std(xs), 1.0)
    mask = np.abs(xs - median_x) < 2 * std_x
    ys = ys[mask]
    xs = xs[mask]

    if len(ys) < 6:
        return result

    # Fit x = slope * y + c through the edge positions; the slope is the
    # horizontal drift of the edge per unit of height, i.e. the shear.
    straight_coeffs = np.polyfit(ys, xs, 1)
    slope = straight_coeffs[0]
    fitted = np.polyval(straight_coeffs, ys)
    residuals = xs - fitted
    rmse = float(np.sqrt(np.mean(residuals ** 2)))

    shear_degrees = math.degrees(math.atan(slope))

    # Confidence grows with the number of inlier strips (saturating at 15)
    # and shrinks with the fit error (RMSE in pixels).
    confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(float(confidence), 2)

    return result
+
+
def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear angle by maximising variance of horizontal text-line projections (Method B).

    Applies candidate shear transforms to a downscaled binary image and
    picks the angle whose horizontal projection profile has the highest
    variance (best-separated text lines), with a coarse sweep followed by
    a fine sweep around the coarse winner.
    """
    result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}

    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Half-resolution is enough for a variance comparison and is faster.
    small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA)
    sh, sw = small.shape

    def _sweep_variance(angles_list):
        # Score each candidate angle by the variance of the row-wise
        # projection after un-shearing by that angle.
        results = []
        for angle_deg in angles_list:
            if abs(angle_deg) < 0.001:
                # Effectively zero: reuse the image unsheared.
                rotated = small
            else:
                # Horizontal shear proportional to y, centred vertically
                # so the middle row stays fixed.
                shear_tan = math.tan(math.radians(angle_deg))
                M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]])
                rotated = cv2.warpAffine(small, M, (sw, sh),
                                         flags=cv2.INTER_NEAREST,
                                         borderMode=cv2.BORDER_CONSTANT)
            profile = np.sum(rotated, axis=1).astype(float)
            results.append((angle_deg, float(np.var(profile))))
        return results

    # Coarse sweep: -3.0 .. +3.0 deg in 0.5 deg steps.
    coarse_angles = [a * 0.5 for a in range(-6, 7)]
    coarse_results = _sweep_variance(coarse_angles)
    coarse_best = max(coarse_results, key=lambda x: x[1])

    # Fine sweep: +/-0.5 deg around the coarse winner in 0.05 deg steps.
    fine_center = coarse_best[0]
    fine_angles = [fine_center + a * 0.05 for a in range(-10, 11)]
    fine_results = _sweep_variance(fine_angles)
    fine_best = max(fine_results, key=lambda x: x[1])

    best_angle = fine_best[0]
    best_variance = fine_best[1]
    variances = coarse_results + fine_results

    # Confidence: how far the winning variance stands above the mean of
    # all sampled variances (scaled by 0.6, clipped to [0, 1]).
    all_mean = sum(v for _, v in variances) / len(variances)
    if all_mean > 0 and best_variance > all_mean:
        confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6)
    else:
        confidence = 0.0

    result["shear_degrees"] = round(best_angle, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
+
+
def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear using Hough transform on printed table / ruled lines (Method C).

    Finds near-horizontal line segments, takes their length-weighted
    median angle, and reports its negation as the shear estimate.
    """
    result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}

    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edge_map = cv2.Canny(gray, 50, 150, apertureSize=3)

    min_len = int(w * 0.15)
    segments = cv2.HoughLinesP(
        edge_map, rho=1, theta=np.pi / 360,
        threshold=int(w * 0.08),
        minLineLength=min_len,
        maxLineGap=20,
    )

    if segments is None or len(segments) < 3:
        return result

    # Collect (angle, length) for near-horizontal, non-vertical segments.
    candidates: List[Tuple[float, float]] = []
    for seg in segments:
        x1, y1, x2, y2 = seg[0]
        if x1 == x2:
            continue
        seg_angle = float(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
        if abs(seg_angle) <= 5.0:
            seg_length = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2))
            candidates.append((seg_angle, seg_length))

    if len(candidates) < 3:
        return result

    # Length-weighted median angle: long ruled lines dominate short
    # noise segments.
    angle_values = np.array([a for a, _ in candidates])
    length_weights = np.array([ln for _, ln in candidates])
    order = np.argsort(angle_values)
    sorted_angles = angle_values[order]
    cum_weights = np.cumsum(length_weights[order])
    half_idx = int(np.searchsorted(cum_weights, cum_weights[-1] / 2.0))
    median_angle = float(sorted_angles[min(half_idx, len(sorted_angles) - 1)])

    # Confidence: share of segments within 1 deg of the median, capped
    # at 0.85 (this method is never fully trusted on its own).
    agreeing = sum(1 for a, _ in candidates if abs(a - median_angle) < 1.0)
    confidence = min(1.0, agreeing / max(len(candidates), 1)) * 0.85

    result["shear_degrees"] = round(-median_angle, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
+
+
def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear by measuring text-line straightness (Method D).

    Runs a quick Tesseract pass on a half-resolution copy, clusters words
    into vertical columns of similar left edges, fits x = f(y) through
    each sufficiently tall column, and takes the median slope as the
    shear estimate.
    """
    result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0}

    h, w = img.shape[:2]
    # Half resolution keeps the Tesseract pass fast; all coordinates
    # below stay in the downscaled space (slopes are scale-invariant).
    scale = 0.5
    small = cv2.resize(img, (int(w * scale), int(h * scale)),
                       interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    pil_img = Image.fromarray(gray)

    try:
        data = pytesseract.image_to_data(
            pil_img, lang='eng+deu', config='--psm 11 --oem 3',
            output_type=pytesseract.Output.DICT,
        )
    except Exception:
        # Best-effort method: on OCR failure report zero confidence.
        return result

    # Collect (left_x, centre_y, width) of confident multi-char words.
    words = []
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        conf = int(data['conf'][i])
        if not text or conf < 20 or len(text) < 2:
            continue
        left_x = float(data['left'][i])
        cy = data['top'][i] + data['height'][i] / 2.0
        word_w = float(data['width'][i])
        words.append((left_x, cy, word_w))

    if len(words) < 15:
        return result

    # Column tolerance: 40% of the average word width, at least 8 px.
    avg_w = sum(ww for _, _, ww in words) / len(words)
    x_tol = max(avg_w * 0.4, 8)

    # Greedy clustering of words (sorted by left edge) into columns; the
    # running column x is an exponential moving average so the column can
    # drift slightly with the shear.
    words_by_x = sorted(words, key=lambda w: w[0])
    columns: List[List[Tuple[float, float]]] = []
    cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
    cur_x = words_by_x[0][0]

    for lx, cy, _ in words_by_x[1:]:
        if abs(lx - cur_x) <= x_tol:
            cur_col.append((lx, cy))
            cur_x = cur_x * 0.8 + lx * 0.2
        else:
            # Keep only columns with at least 5 members.
            if len(cur_col) >= 5:
                columns.append(cur_col)
            cur_col = [(lx, cy)]
            cur_x = lx
    if len(cur_col) >= 5:
        columns.append(cur_col)

    if len(columns) < 2:
        return result

    # Per-column drift: slope of x over y, for columns that span at
    # least 30% of the (downscaled) page height.
    drifts = []
    for col in columns:
        ys = np.array([p[1] for p in col])
        xs = np.array([p[0] for p in col])
        y_range = ys.max() - ys.min()
        if y_range < h * scale * 0.3:
            continue
        coeffs = np.polyfit(ys, xs, 1)
        drifts.append(coeffs[0])

    if len(drifts) < 2:
        return result

    median_drift = float(np.median(drifts))
    shear_degrees = math.degrees(math.atan(median_drift))

    # Confidence: half from column count (saturating at 4), half from
    # how consistent the drifts are across columns.
    drift_std = float(np.std(drifts))
    consistency = max(0.0, 1.0 - drift_std * 50)
    count_factor = min(1.0, len(drifts) / 4.0)
    confidence = count_factor * 0.5 + consistency * 0.5

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
                "shear=%.3f\u00b0, conf=%.2f",
                len(columns), len(drifts), median_drift,
                shear_degrees, confidence)
    return result
+
+
+# =============================================================================
+# Quality Check and Shear Application
+# =============================================================================
+
def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool:
    """Check whether the dewarp correction actually improved alignment.

    Better-aligned horizontal text lines concentrate ink into fewer rows,
    raising the variance of the horizontal projection profile.  The
    correction is accepted only when that variance strictly increases.
    """
    def _row_projection_variance(image: np.ndarray) -> float:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255,
                                  cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        # Half resolution is plenty for a variance comparison and faster.
        half = cv2.resize(binary, (binary.shape[1] // 2, binary.shape[0] // 2),
                          interpolation=cv2.INTER_AREA)
        row_sums = np.sum(half, axis=1).astype(float)
        return float(np.var(row_sums))

    return _row_projection_variance(corrected) > _row_projection_variance(original)
+
+
def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Apply a vertical shear correction to an image.

    Shifts each row horizontally in proportion to its distance from the
    vertical centre (x' = x + tan(angle) * (y - h/2)), so the centre row
    stays fixed while the shear is removed.
    """
    height, width = img.shape[:2]
    tangent = math.tan(math.radians(shear_degrees))

    shear_matrix = np.float32([
        [1, tangent, -height / 2.0 * tangent],
        [0, 1, 0],
    ])

    return cv2.warpAffine(img, shear_matrix, (width, height),
                          flags=cv2.INTER_LINEAR,
                          borderMode=cv2.BORDER_REPLICATE)
+
+
+# =============================================================================
+# Ensemble Shear Combination
+# =============================================================================
+
+def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
+ """Combine multiple shear detections into a single weighted estimate (v2)."""
+ _MIN_CONF = 0.35
+ _METHOD_WEIGHT_BOOST = {"text_lines": 1.5}
+
+ accepted = []
+ for d in detections:
+ if d["confidence"] < _MIN_CONF:
+ continue
+ boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0)
+ effective_conf = d["confidence"] * boost
+ accepted.append((d["shear_degrees"], effective_conf, d["method"]))
+
+ if not accepted:
+ return 0.0, 0.0, "none"
+
+ if len(accepted) == 1:
+ deg, conf, method = accepted[0]
+ return deg, min(conf, 1.0), method
+
+ total_w = sum(c for _, c, _ in accepted)
+ w_mean = sum(d * c for d, c, _ in accepted) / total_w
+
+ filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
+ if not filtered:
+ filtered = accepted
+
+ total_w2 = sum(c for _, c, _ in filtered)
+ final_deg = sum(d * c for d, c, _ in filtered) / total_w2
+
+ avg_conf = total_w2 / len(filtered)
+ spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
+ agreement_bonus = 0.15 if spread < 0.5 else 0.0
+ ensemble_conf = min(1.0, avg_conf + agreement_bonus)
+
+ methods_str = "+".join(m for _, _, m in filtered)
+ return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str
+
+
+# =============================================================================
+# Main Dewarp Function
+# =============================================================================
+
def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Correct vertical shear after deskew (v2 with quality gate).

    Methods (all run in ~150ms total):
      A. _detect_shear_angle() -- vertical edge profile (~50ms)
      B. _detect_shear_by_projection() -- horizontal text-line variance (~30ms)
      C. _detect_shear_by_hough() -- Hough lines on table borders (~20ms)
      D. _detect_shear_by_text_lines() -- text-line straightness (~50ms)

    The ensemble estimate is only applied when the angle is at least
    0.08 deg and the confidence at least 0.4; corrections of 0.5 deg or
    more must additionally pass _dewarp_quality_check().

    Args:
        img: BGR image (already deskewed).
        use_ensemble: If False, fall back to single-method behaviour (method A only).

    Returns:
        Tuple of (corrected_image, dewarp_info).  dewarp_info always
        carries the individual "detections", even when no correction was
        applied.
    """
    no_correction = {
        "method": "none",
        "shear_degrees": 0.0,
        "confidence": 0.0,
        "detections": [],
    }

    if not CV2_AVAILABLE:
        return img, no_correction

    t0 = time.time()

    if use_ensemble:
        # Run all four detectors and combine them.
        det_a = _detect_shear_angle(img)
        det_b = _detect_shear_by_projection(img)
        det_c = _detect_shear_by_hough(img)
        det_d = _detect_shear_by_text_lines(img)
        detections = [det_a, det_b, det_c, det_d]
        shear_deg, confidence, method = _ensemble_shear(detections)
    else:
        # Legacy single-method mode: vertical-edge detector only.
        det_a = _detect_shear_angle(img)
        detections = [det_a]
        shear_deg = det_a["shear_degrees"]
        confidence = det_a["confidence"]
        method = det_a["method"]

    duration = time.time() - t0

    # One-line summary of the ensemble plus each detector's raw output
    # (indexing is guarded for the single-method case).
    logger.info(
        "dewarp: ensemble shear=%.3f\u00b0 conf=%.2f method=%s (%.2fs) | "
        "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f",
        shear_deg, confidence, method, duration,
        detections[0]["shear_degrees"], detections[0]["confidence"],
        detections[1]["shear_degrees"] if len(detections) > 1 else 0.0,
        detections[1]["confidence"] if len(detections) > 1 else 0.0,
        detections[2]["shear_degrees"] if len(detections) > 2 else 0.0,
        detections[2]["confidence"] if len(detections) > 2 else 0.0,
        detections[3]["shear_degrees"] if len(detections) > 3 else 0.0,
        detections[3]["confidence"] if len(detections) > 3 else 0.0,
    )

    # Always include individual detections in the returned info, even
    # when no correction ends up being applied.
    _all_detections = [
        {"method": d["method"], "shear_degrees": d["shear_degrees"],
         "confidence": d["confidence"]}
        for d in detections
    ]

    # Thresholds: shear below 0.08 deg is irrelevant; confidence below
    # 0.4 is too weak to risk a correction.
    if abs(shear_deg) < 0.08 or confidence < 0.4:
        no_correction["detections"] = _all_detections
        return img, no_correction

    # Negate the detected shear to straighten the image.
    corrected = _apply_shear(img, -shear_deg)

    # Quality gate for large corrections (>= 0.5 deg) only: small
    # corrections change the projection variance too little to judge.
    if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected):
        logger.info("dewarp: quality gate REJECTED correction (%.3f\u00b0) -- "
                    "projection variance did not improve", shear_deg)
        no_correction["detections"] = _all_detections
        return img, no_correction

    info = {
        "method": method,
        "shear_degrees": shear_deg,
        "confidence": confidence,
        "detections": _all_detections,
    }

    return corrected, info
+
+
def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Apply shear correction with a manual angle.

    Args:
        img: BGR image (deskewed, before dewarp).
        shear_degrees: Shear angle in degrees to correct.

    Returns:
        Corrected image, or the input unchanged for negligible angles.
    """
    # Below ~0.001 deg the shear matrix is effectively the identity.
    if not abs(shear_degrees) < 0.001:
        return _apply_shear(img, -shear_degrees)
    return img
diff --git a/klausur-service/backend/cv_review.py b/klausur-service/backend/cv_review.py
index 5da85c2..217e463 100644
--- a/klausur-service/backend/cv_review.py
+++ b/klausur-service/backend/cv_review.py
@@ -1,1248 +1,46 @@
"""
Multi-pass OCR, line matching, LLM/spell review, and pipeline orchestration.
+Re-export facade -- all logic lives in the sub-modules:
+
+ cv_review_pipeline Stages 6-8: OCR, line alignment, orchestrator
+ cv_review_spell Rule-based spell-checker OCR correction
+ cv_review_llm LLM-based OCR correction, prompt building, streaming
+
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
-import json
-import logging
-import os
-import re
-import time
-from typing import Any, Dict, List, Optional, Tuple
-
-import numpy as np
-
-from cv_vocab_types import (
- CV_PIPELINE_AVAILABLE,
- PageRegion,
- PipelineResult,
- VocabRow,
-)
-from cv_preprocessing import (
- deskew_image,
- dewarp_image,
- render_image_high_res,
- render_pdf_high_res,
-)
-from cv_layout import (
- analyze_layout,
- create_layout_image,
- create_ocr_image,
-)
-from cv_ocr_engines import (
- _fix_character_confusion,
- _group_words_into_lines,
+# Re-export everything for backward compatibility
+from cv_review_pipeline import ( # noqa: F401
+ ocr_region,
+ run_multi_pass_ocr,
+ match_lines_to_vocab,
+ llm_post_correct,
+ run_cv_pipeline,
)
-logger = logging.getLogger(__name__)
-
-try:
- import cv2
-except ImportError:
- cv2 = None # type: ignore[assignment]
-
-try:
- import pytesseract
- from PIL import Image
-except ImportError:
- pytesseract = None # type: ignore[assignment]
- Image = None # type: ignore[assignment,misc]
-
-
-# =============================================================================
-# Stage 6: Multi-Pass OCR
-# =============================================================================
-
-def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
- psm: int, fallback_psm: Optional[int] = None,
- min_confidence: float = 40.0) -> List[Dict[str, Any]]:
- """Run Tesseract OCR on a specific region with given PSM.
-
- Args:
- ocr_img: Binarized full-page image.
- region: Region to crop and OCR.
- lang: Tesseract language string.
- psm: Page Segmentation Mode.
- fallback_psm: If confidence too low, retry with this PSM per line.
- min_confidence: Minimum average confidence before fallback.
-
- Returns:
- List of word dicts with text, position, confidence.
- """
- # Crop region
- crop = ocr_img[region.y:region.y + region.height,
- region.x:region.x + region.width]
-
- if crop.size == 0:
- return []
-
- # Convert to PIL for pytesseract
- pil_img = Image.fromarray(crop)
-
- # Run Tesseract with specified PSM
- config = f'--psm {psm} --oem 3'
- try:
- data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
- output_type=pytesseract.Output.DICT)
- except Exception as e:
- logger.warning(f"Tesseract failed for region {region.type}: {e}")
- return []
-
- words = []
- for i in range(len(data['text'])):
- text = data['text'][i].strip()
- conf = int(data['conf'][i])
- if not text or conf < 10:
- continue
- words.append({
- 'text': text,
- 'left': data['left'][i] + region.x, # Absolute coords
- 'top': data['top'][i] + region.y,
- 'width': data['width'][i],
- 'height': data['height'][i],
- 'conf': conf,
- 'region_type': region.type,
- })
-
- # Check average confidence
- if words and fallback_psm is not None:
- avg_conf = sum(w['conf'] for w in words) / len(words)
- if avg_conf < min_confidence:
- logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, "
- f"trying fallback PSM {fallback_psm}")
- words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm)
-
- return words
-
-
-def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
- lang: str, psm: int) -> List[Dict[str, Any]]:
- """OCR a region line by line (fallback for low-confidence regions).
-
- Splits the region into horizontal strips based on text density,
- then OCRs each strip individually with the given PSM.
- """
- crop = ocr_img[region.y:region.y + region.height,
- region.x:region.x + region.width]
-
- if crop.size == 0:
- return []
-
- # Find text lines via horizontal projection
- inv = cv2.bitwise_not(crop)
- h_proj = np.sum(inv, axis=1)
- threshold = np.max(h_proj) * 0.05 if np.max(h_proj) > 0 else 0
-
- # Find line boundaries
- lines = []
- in_text = False
- line_start = 0
- for y in range(len(h_proj)):
- if h_proj[y] > threshold and not in_text:
- line_start = y
- in_text = True
- elif h_proj[y] <= threshold and in_text:
- if y - line_start > 5: # Minimum line height
- lines.append((line_start, y))
- in_text = False
- if in_text and len(h_proj) - line_start > 5:
- lines.append((line_start, len(h_proj)))
-
- all_words = []
- config = f'--psm {psm} --oem 3'
-
- for line_y_start, line_y_end in lines:
- # Add small padding
- pad = 3
- y1 = max(0, line_y_start - pad)
- y2 = min(crop.shape[0], line_y_end + pad)
- line_crop = crop[y1:y2, :]
-
- if line_crop.size == 0:
- continue
-
- pil_img = Image.fromarray(line_crop)
- try:
- data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
- output_type=pytesseract.Output.DICT)
- except Exception:
- continue
-
- for i in range(len(data['text'])):
- text = data['text'][i].strip()
- conf = int(data['conf'][i])
- if not text or conf < 10:
- continue
- all_words.append({
- 'text': text,
- 'left': data['left'][i] + region.x,
- 'top': data['top'][i] + region.y + y1,
- 'width': data['width'][i],
- 'height': data['height'][i],
- 'conf': conf,
- 'region_type': region.type,
- })
-
- return all_words
-
-
-def run_multi_pass_ocr(ocr_img: np.ndarray,
- regions: List[PageRegion],
- lang: str = "eng+deu") -> Dict[str, List[Dict]]:
- """Run OCR on each detected region with optimized settings.
-
- Args:
- ocr_img: Binarized full-page image.
- regions: Detected page regions.
- lang: Default language.
-
- Returns:
- Dict mapping region type to list of word dicts.
- """
- results: Dict[str, List[Dict]] = {}
-
- _ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
- for region in regions:
- if region.type in _ocr_skip:
- continue # Skip non-content regions
-
- if region.type == 'column_en':
- words = ocr_region(ocr_img, region, lang='eng', psm=4)
- elif region.type == 'column_de':
- words = ocr_region(ocr_img, region, lang='deu', psm=4)
- elif region.type == 'column_example':
- words = ocr_region(ocr_img, region, lang=lang, psm=6,
- fallback_psm=7, min_confidence=40.0)
- else:
- words = ocr_region(ocr_img, region, lang=lang, psm=6)
-
- results[region.type] = words
- logger.info(f"OCR {region.type}: {len(words)} words")
-
- return results
-
-
-# =============================================================================
-# Stage 7: Line Alignment → Vocabulary Entries
-# =============================================================================
-
-def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
- regions: List[PageRegion],
- y_tolerance_px: int = 25) -> List[VocabRow]:
- """Align OCR results from different columns into vocabulary rows.
-
- Uses Y-coordinate matching to pair English words, German translations,
- and example sentences that appear on the same line.
-
- Args:
- ocr_results: Dict mapping region type to word lists.
- regions: Detected regions (for reference).
- y_tolerance_px: Max Y-distance to consider words on the same row.
-
- Returns:
- List of VocabRow objects.
- """
- # If no vocabulary columns detected (e.g. plain text page), return empty
- if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
- logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
- return []
-
- # Group words into lines per column
- en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
- de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
- ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)
-
- def line_y_center(line: List[Dict]) -> float:
- return sum(w['top'] + w['height'] / 2 for w in line) / len(line)
-
- def line_text(line: List[Dict]) -> str:
- return ' '.join(w['text'] for w in line)
-
- def line_confidence(line: List[Dict]) -> float:
- return sum(w['conf'] for w in line) / len(line) if line else 0
-
- # Build EN entries as the primary reference
- vocab_rows: List[VocabRow] = []
-
- for en_line in en_lines:
- en_y = line_y_center(en_line)
- en_text = line_text(en_line)
- en_conf = line_confidence(en_line)
-
- # Skip very short or likely header content
- if len(en_text.strip()) < 2:
- continue
-
- # Find matching DE line
- de_text = ""
- de_conf = 0.0
- best_de_dist = float('inf')
- best_de_idx = -1
- for idx, de_line in enumerate(de_lines):
- dist = abs(line_y_center(de_line) - en_y)
- if dist < y_tolerance_px and dist < best_de_dist:
- best_de_dist = dist
- best_de_idx = idx
-
- if best_de_idx >= 0:
- de_text = line_text(de_lines[best_de_idx])
- de_conf = line_confidence(de_lines[best_de_idx])
-
- # Find matching example line
- ex_text = ""
- ex_conf = 0.0
- best_ex_dist = float('inf')
- best_ex_idx = -1
- for idx, ex_line in enumerate(ex_lines):
- dist = abs(line_y_center(ex_line) - en_y)
- if dist < y_tolerance_px and dist < best_ex_dist:
- best_ex_dist = dist
- best_ex_idx = idx
-
- if best_ex_idx >= 0:
- ex_text = line_text(ex_lines[best_ex_idx])
- ex_conf = line_confidence(ex_lines[best_ex_idx])
-
- avg_conf = en_conf
- conf_count = 1
- if de_conf > 0:
- avg_conf += de_conf
- conf_count += 1
- if ex_conf > 0:
- avg_conf += ex_conf
- conf_count += 1
-
- vocab_rows.append(VocabRow(
- english=en_text.strip(),
- german=de_text.strip(),
- example=ex_text.strip(),
- confidence=avg_conf / conf_count,
- y_position=int(en_y),
- ))
-
- # Handle multi-line wrapping in example column:
- # If an example line has no matching EN/DE, append to previous entry
- matched_ex_ys = set()
- for row in vocab_rows:
- if row.example:
- matched_ex_ys.add(row.y_position)
-
- for ex_line in ex_lines:
- ex_y = line_y_center(ex_line)
- # Check if already matched
- already_matched = any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys)
- if already_matched:
- continue
-
- # Find nearest previous vocab row
- best_row = None
- best_dist = float('inf')
- for row in vocab_rows:
- dist = ex_y - row.y_position
- if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
- best_dist = dist
- best_row = row
-
- if best_row:
- continuation = line_text(ex_line).strip()
- if continuation:
- best_row.example = (best_row.example + " " + continuation).strip()
-
- # Sort by Y position
- vocab_rows.sort(key=lambda r: r.y_position)
-
- return vocab_rows
-
-
-# =============================================================================
-# Stage 8: Optional LLM Post-Correction
-# =============================================================================
-
-async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
- confidence_threshold: float = 50.0,
- enabled: bool = False) -> List[VocabRow]:
- """Optionally send low-confidence regions to Qwen-VL for correction.
-
- Default: disabled. Enable per parameter.
-
- Args:
- img: Original BGR image.
- vocab_rows: Current vocabulary rows.
- confidence_threshold: Rows below this get LLM correction.
- enabled: Whether to actually run LLM correction.
-
- Returns:
- Corrected vocabulary rows.
- """
- if not enabled:
- return vocab_rows
-
- # TODO: Implement Qwen-VL correction for low-confidence entries
- # For each row with confidence < threshold:
- # 1. Crop the relevant region from img
- # 2. Send crop + OCR text to Qwen-VL
- # 3. Replace text if LLM provides a confident correction
- logger.info(f"LLM post-correction skipped (not yet implemented)")
- return vocab_rows
-
-
-# =============================================================================
-# Orchestrator
-# =============================================================================
-
-async def run_cv_pipeline(
- pdf_data: Optional[bytes] = None,
- image_data: Optional[bytes] = None,
- page_number: int = 0,
- zoom: float = 3.0,
- enable_dewarp: bool = True,
- enable_llm_correction: bool = False,
- lang: str = "eng+deu",
-) -> PipelineResult:
- """Run the complete CV document reconstruction pipeline.
-
- Args:
- pdf_data: Raw PDF bytes (mutually exclusive with image_data).
- image_data: Raw image bytes (mutually exclusive with pdf_data).
- page_number: 0-indexed page number (for PDF).
- zoom: PDF rendering zoom factor.
- enable_dewarp: Whether to run dewarp stage.
- enable_llm_correction: Whether to run LLM post-correction.
- lang: Tesseract language string.
-
- Returns:
- PipelineResult with vocabulary and timing info.
- """
- if not CV_PIPELINE_AVAILABLE:
- return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")
-
- result = PipelineResult()
- total_start = time.time()
-
- try:
- # Stage 1: Render
- t = time.time()
- if pdf_data:
- img = render_pdf_high_res(pdf_data, page_number, zoom)
- elif image_data:
- img = render_image_high_res(image_data)
- else:
- return PipelineResult(error="No input data (pdf_data or image_data required)")
- result.stages['render'] = round(time.time() - t, 2)
- result.image_width = img.shape[1]
- result.image_height = img.shape[0]
- logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")
-
- # Stage 2: Deskew
- t = time.time()
- img, angle = deskew_image(img)
- result.stages['deskew'] = round(time.time() - t, 2)
- logger.info(f"Stage 2 (deskew): {angle:.2f}° in {result.stages['deskew']}s")
-
- # Stage 3: Dewarp
- if enable_dewarp:
- t = time.time()
- img, _dewarp_info = dewarp_image(img)
- result.stages['dewarp'] = round(time.time() - t, 2)
-
- # Stage 4: Dual image preparation
- t = time.time()
- ocr_img = create_ocr_image(img)
- layout_img = create_layout_image(img)
- result.stages['image_prep'] = round(time.time() - t, 2)
-
- # Stage 5: Layout analysis
- t = time.time()
- regions = analyze_layout(layout_img, ocr_img)
- result.stages['layout'] = round(time.time() - t, 2)
- result.columns_detected = len([r for r in regions if r.type.startswith('column')])
- logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")
-
- # Stage 6: Multi-pass OCR
- t = time.time()
- ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
- result.stages['ocr'] = round(time.time() - t, 2)
- total_words = sum(len(w) for w in ocr_results.values())
- result.word_count = total_words
- logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")
-
- # Stage 7: Line alignment
- t = time.time()
- vocab_rows = match_lines_to_vocab(ocr_results, regions)
- result.stages['alignment'] = round(time.time() - t, 2)
-
- # Stage 8: Optional LLM correction
- if enable_llm_correction:
- t = time.time()
- vocab_rows = await llm_post_correct(img, vocab_rows)
- result.stages['llm_correction'] = round(time.time() - t, 2)
-
- # Convert to output format
- result.vocabulary = [
- {
- "english": row.english,
- "german": row.german,
- "example": row.example,
- "confidence": round(row.confidence, 1),
- }
- for row in vocab_rows
- if row.english or row.german # Skip empty rows
- ]
-
- result.duration_seconds = round(time.time() - total_start, 2)
- logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")
-
- except Exception as e:
- logger.error(f"CV Pipeline error: {e}")
- import traceback
- logger.debug(traceback.format_exc())
- result.error = str(e)
- result.duration_seconds = round(time.time() - total_start, 2)
-
- return result
-
-
-# ---------------------------------------------------------------------------
-# LLM-based OCR Correction (Step 6)
-# ---------------------------------------------------------------------------
-
-import httpx
-import os
-import json as _json
-import re as _re
-
-_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
-OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
-_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
-logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)
-
-# Regex: entry contains IPA phonetic brackets like "dance [dɑːns]"
-_HAS_PHONETIC_RE = _re.compile(r'\[.*?[ˈˌːʃʒθðŋɑɒɔəɜɪʊʌæ].*?\]')
-
-# Regex: digit adjacent to a letter — the hallmark of OCR digit↔letter confusion.
-# Matches digits 0,1,5,6,8 (common OCR confusions: 0→O, 1→l/I, 5→S, 6→G, 8→B)
-# when they appear inside or next to a word character.
-_OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568](?=[A-Za-zÄÖÜäöüß])')
-
-
-def _entry_needs_review(entry: Dict) -> bool:
- """Check if an entry should be sent to the LLM for review.
-
- Sends all non-empty entries that don't have IPA phonetic transcriptions.
- The LLM prompt and _is_spurious_change() guard against unwanted changes.
- """
- en = entry.get("english", "") or ""
- de = entry.get("german", "") or ""
-
- # Skip completely empty entries
- if not en.strip() and not de.strip():
- return False
- # Skip entries with IPA/phonetic brackets — dictionary-corrected, LLM must not touch them
- if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de):
- return False
- return True
-
-
-def _build_llm_prompt(table_lines: List[Dict]) -> str:
- """Build the LLM correction prompt for a batch of entries."""
- return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).
-
-DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.
-
-NUR diese Korrekturen sind erlaubt:
-- Ziffer 8 statt B: "8en" → "Ben", "8uch" → "Buch", "8all" → "Ball"
-- Ziffer 0 statt O oder o: "L0ndon" → "London", "0ld" → "Old"
-- Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin"
-- Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See"
-- Ziffer 6 statt G oder g: "6eld" → "Geld"
-- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help"
-
-ABSOLUT VERBOTEN — aendere NIEMALS:
-- Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst
-- Uebersetzungen — du uebersetzt NICHTS, weder EN→DE noch DE→EN
-- Korrekte englische Woerter (en-Spalte) — auch wenn du eine Bedeutung kennst
-- Korrekte deutsche Woerter (de-Spalte) — auch wenn du sie anders sagen wuerdest
-- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
-- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
-- Lautschrift in eckigen Klammern [...] — diese NIEMALS beruehren
-- Beispielsaetze in der ex-Spalte — NIEMALS aendern
-
-Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.
-
-Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
-Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).
-
-/no_think
-
-Eingabe:
-{_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
-
-
-def _is_spurious_change(old_val: str, new_val: str) -> bool:
- """Detect LLM changes that are likely wrong and should be discarded.
-
- Only digit↔letter substitutions (0→O, 1→l, 5→S, 6→G, 8→B) are
- legitimate OCR corrections. Everything else is rejected.
-
- Filters out:
- - Case-only changes
- - Changes that don't contain any digit→letter fix
- - Completely different words (LLM translating or hallucinating)
- - Additions or removals of whole words (count changed)
- """
- if not old_val or not new_val:
- return False
-
- # Case-only change — never a real OCR error
- if old_val.lower() == new_val.lower():
- return True
-
- # If the word count changed significantly, the LLM rewrote rather than fixed
- old_words = old_val.split()
- new_words = new_val.split()
- if abs(len(old_words) - len(new_words)) > 1:
- return True
-
- # Core rule: a legitimate correction replaces a digit with the corresponding
- # letter. If the change doesn't include such a substitution, reject it.
- # Build a set of (old_char, new_char) pairs that differ between old and new.
- # Use character-level diff heuristic: if lengths are close, zip and compare.
- # Map of characters that OCR commonly misreads → set of correct replacements
- _OCR_CHAR_MAP = {
- # Digits mistaken for letters
- '0': set('oOgG'),
- '1': set('lLiI'),
- '5': set('sS'),
- '6': set('gG'),
- '8': set('bB'),
- # Non-letter symbols mistaken for letters
- '|': set('lLiI1'), # pipe → lowercase l, capital I, or digit 1
- 'l': set('iI|1'), # lowercase l → capital I (and reverse)
- }
- has_valid_fix = False
- if len(old_val) == len(new_val):
- for oc, nc in zip(old_val, new_val):
- if oc != nc:
- if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
- has_valid_fix = True
- elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
- # Reverse check (e.g. l→I where new is the "correct" char)
- has_valid_fix = True
- else:
- # Length changed by 1: accept if old had a suspicious char sequence
- _OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]')
- if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
- has_valid_fix = True
-
- if not has_valid_fix:
- return True # Reject — looks like translation or hallucination
-
- return False
-
-
-def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
- """Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
- changes = []
- entries_out = []
- for i, orig in enumerate(originals):
- if i < len(corrected):
- c = corrected[i]
- entry = dict(orig)
- for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]:
- new_val = c.get(key, "").strip()
- old_val = (orig.get(field_name, "") or "").strip()
- if new_val and new_val != old_val:
- # Filter spurious LLM changes
- if _is_spurious_change(old_val, new_val):
- continue
- changes.append({
- "row_index": orig.get("row_index", i),
- "field": field_name,
- "old": old_val,
- "new": new_val,
- })
- entry[field_name] = new_val
- entry["llm_corrected"] = True
- entries_out.append(entry)
- else:
- entries_out.append(dict(orig))
- return changes, entries_out
-
-
-# ─── Spell-Checker OCR Review (Rule-Based, no LLM) ────────────────────────────
-
-REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell") # "spell" (default) | "llm"
-
-try:
- from spellchecker import SpellChecker as _SpellChecker
- _en_spell = _SpellChecker(language='en', distance=1)
- _de_spell = _SpellChecker(language='de', distance=1)
- _SPELL_AVAILABLE = True
- logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE)
-except ImportError:
- _SPELL_AVAILABLE = False
- logger.warning("pyspellchecker not installed — falling back to LLM review")
-
-# ─── Page-Ref Normalization ───────────────────────────────────────────────────
-# Normalizes OCR variants like "p-60", "p 61", "p60" → "p.60"
-_PAGE_REF_RE = _re.compile(r'\bp[\s\-]?(\d+)', _re.IGNORECASE)
-
-
-def _normalize_page_ref(text: str) -> str:
- """Normalize page references: 'p-60' / 'p 61' / 'p60' → 'p.60'."""
- if not text:
- return text
- return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text)
-
-
-# Suspicious OCR chars → ordered list of most-likely correct replacements
-_SPELL_SUBS: Dict[str, List[str]] = {
- '0': ['O', 'o'],
- '1': ['l', 'I'],
- '5': ['S', 's'],
- '6': ['G', 'g'],
- '8': ['B', 'b'],
- '|': ['I', 'l', '1'],
-}
-_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())
-
-# Tokenizer: word tokens (letters + pipe) alternating with separators
-_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)')
-
-
-def _spell_dict_knows(word: str) -> bool:
- """True if word is known in EN or DE dictionary."""
- if not _SPELL_AVAILABLE:
- return False
- w = word.lower()
- return bool(_en_spell.known([w])) or bool(_de_spell.known([w]))
-
-
-def _try_split_merged_word(token: str) -> Optional[str]:
- """Try to split a merged word like 'atmyschool' into 'at my school'.
-
- Uses dynamic programming to find the shortest sequence of dictionary
- words that covers the entire token. Only returns a result when the
- split produces at least 2 words and ALL parts are known dictionary words.
-
- Preserves original capitalisation by mapping back to the input string.
- """
- if not _SPELL_AVAILABLE or len(token) < 4:
- return None
-
- lower = token.lower()
- n = len(lower)
-
- # dp[i] = (word_lengths_list, score) for best split of lower[:i], or None
- # Score: (-word_count, sum_of_squared_lengths) — fewer words first,
- # then prefer longer words (e.g. "come on" over "com eon")
- dp: list = [None] * (n + 1)
- dp[0] = ([], 0)
-
- for i in range(1, n + 1):
- for j in range(max(0, i - 20), i):
- if dp[j] is None:
- continue
- candidate = lower[j:i]
- word_len = i - j
- if word_len == 1 and candidate not in ('a', 'i'):
- continue
- if _spell_dict_knows(candidate):
- prev_words, prev_sq = dp[j]
- new_words = prev_words + [word_len]
- new_sq = prev_sq + word_len * word_len
- new_key = (-len(new_words), new_sq)
- if dp[i] is None:
- dp[i] = (new_words, new_sq)
- else:
- old_key = (-len(dp[i][0]), dp[i][1])
- if new_key >= old_key:
- # >= so that later splits (longer first word) win ties
- dp[i] = (new_words, new_sq)
-
- if dp[n] is None or len(dp[n][0]) < 2:
- return None
-
- # Reconstruct with original casing
- result = []
- pos = 0
- for wlen in dp[n][0]:
- result.append(token[pos:pos + wlen])
- pos += wlen
-
- logger.debug("Split merged word: %r → %r", token, " ".join(result))
- return " ".join(result)
-
-
-def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
- """Return corrected form of token, or None if no fix needed/possible.
-
- *field* is 'english' or 'german' — used to pick the right dictionary
- for general spell correction (step 3 below).
- """
- has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)
-
- # 1. Already known word → no fix needed
- if _spell_dict_knows(token):
- return None
-
- # 2. Digit/pipe substitution (existing logic)
- if has_suspicious:
- # Standalone pipe → capital I
- if token == '|':
- return 'I'
- # Dictionary-backed single-char substitution
- for i, ch in enumerate(token):
- if ch not in _SPELL_SUBS:
- continue
- for replacement in _SPELL_SUBS[ch]:
- candidate = token[:i] + replacement + token[i + 1:]
- if _spell_dict_knows(candidate):
- return candidate
- # Structural rule: suspicious char at position 0 + rest is all lowercase letters
- first = token[0]
- if first in _SPELL_SUBS and len(token) >= 2:
- rest = token[1:]
- if rest.isalpha() and rest.islower():
- candidate = _SPELL_SUBS[first][0] + rest
- if not candidate[0].isdigit():
- return candidate
-
- # 3. OCR umlaut confusion: OCR often drops umlaut dots (ü→i, ä→a, ö→o, ü→u)
- # Try single-char umlaut substitutions and check against dictionary.
- if len(token) >= 3 and token.isalpha() and field == "german":
- _UMLAUT_SUBS = {'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
- 'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü'}
- for i, ch in enumerate(token):
- if ch in _UMLAUT_SUBS:
- candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:]
- if _spell_dict_knows(candidate):
- return candidate
-
- # 4. General spell correction for unknown words (no digits/pipes)
- # e.g. "beautful" → "beautiful"
- if not has_suspicious and len(token) >= 3 and token.isalpha():
- spell = _en_spell if field == "english" else _de_spell if field == "german" else None
- if spell is not None:
- correction = spell.correction(token.lower())
- if correction and correction != token.lower():
- # Preserve original capitalisation pattern
- if token[0].isupper():
- correction = correction[0].upper() + correction[1:]
- if _spell_dict_knows(correction):
- return correction
-
- # 5. Merged-word split: OCR often merges adjacent words when spacing
- # is too tight, e.g. "atmyschool" → "at my school"
- if len(token) >= 4 and token.isalpha():
- split = _try_split_merged_word(token)
- if split:
- return split
-
- return None
-
-
-def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
- """Apply OCR corrections to a text field. Returns (fixed_text, was_changed).
-
- *field* is 'english' or 'german' — forwarded to _spell_fix_token for
- dictionary selection.
- """
- if not text:
- return text, False
- has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
- # If no suspicious chars AND no alpha chars that could be misspelled, skip
- if not has_suspicious and not any(c.isalpha() for c in text):
- return text, False
- # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
- fixed = _re.sub(r'(? Dict:
- """Rule-based OCR correction: spell-checker + structural heuristics.
-
- Deterministic — never translates, never touches IPA, never hallucinates.
- Uses SmartSpellChecker for language-aware corrections with context-based
- disambiguation (a/I), multi-digit substitution, and cross-language guard.
- """
- t0 = time.time()
- changes: List[Dict] = []
- all_corrected: List[Dict] = []
-
- # Use SmartSpellChecker if available, fall back to legacy _spell_fix_field
- _smart = None
- try:
- from smart_spell import SmartSpellChecker
- _smart = SmartSpellChecker()
- logger.debug("spell_review: using SmartSpellChecker")
- except Exception:
- logger.debug("spell_review: SmartSpellChecker not available, using legacy")
-
- # Map field names → language codes for SmartSpellChecker
- _LANG_MAP = {"english": "en", "german": "de", "example": "auto"}
-
- for i, entry in enumerate(entries):
- e = dict(entry)
- # Page-ref normalization (always, regardless of review status)
- old_ref = (e.get("source_page") or "").strip()
- if old_ref:
- new_ref = _normalize_page_ref(old_ref)
- if new_ref != old_ref:
- changes.append({
- "row_index": e.get("row_index", i),
- "field": "source_page",
- "old": old_ref,
- "new": new_ref,
- })
- e["source_page"] = new_ref
- e["llm_corrected"] = True
- if not _entry_needs_review(e):
- all_corrected.append(e)
- continue
- for field_name in ("english", "german", "example"):
- old_val = (e.get(field_name) or "").strip()
- if not old_val:
- continue
-
- if _smart:
- # SmartSpellChecker path — language-aware, context-based
- lang_code = _LANG_MAP.get(field_name, "en")
- result = _smart.correct_text(old_val, lang=lang_code)
- new_val = result.corrected
- was_changed = result.changed
- else:
- # Legacy path
- lang = "german" if field_name in ("german", "example") else "english"
- new_val, was_changed = _spell_fix_field(old_val, field=lang)
-
- if was_changed and new_val != old_val:
- changes.append({
- "row_index": e.get("row_index", i),
- "field": field_name,
- "old": old_val,
- "new": new_val,
- })
- e[field_name] = new_val
- e["llm_corrected"] = True
- all_corrected.append(e)
- duration_ms = int((time.time() - t0) * 1000)
- model_name = "smart-spell-checker" if _smart else "spell-checker"
- return {
- "entries_original": entries,
- "entries_corrected": all_corrected,
- "changes": changes,
- "skipped_count": 0,
- "model_used": model_name,
- "duration_ms": duration_ms,
- }
-
-
-async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
- """Async generator yielding SSE-compatible events for spell-checker review."""
- total = len(entries)
- yield {
- "type": "meta",
- "total_entries": total,
- "to_review": total,
- "skipped": 0,
- "model": "spell-checker",
- "batch_size": batch_size,
- }
- result = spell_review_entries_sync(entries)
- changes = result["changes"]
- yield {
- "type": "batch",
- "batch_index": 0,
- "entries_reviewed": [e.get("row_index", i) for i, e in enumerate(entries)],
- "changes": changes,
- "duration_ms": result["duration_ms"],
- "progress": {"current": total, "total": total},
- }
- yield {
- "type": "complete",
- "changes": changes,
- "model_used": "spell-checker",
- "duration_ms": result["duration_ms"],
- "total_entries": total,
- "reviewed": total,
- "skipped": 0,
- "corrections_found": len(changes),
- "entries_corrected": result["entries_corrected"],
- }
-
-# ─── End Spell-Checker ────────────────────────────────────────────────────────
-
-
-async def llm_review_entries(
- entries: List[Dict],
- model: str = None,
-) -> Dict:
- """OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm)."""
- if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
- return spell_review_entries_sync(entries)
- if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
- logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
-
- model = model or OLLAMA_REVIEW_MODEL
-
- # Filter: only entries that need review
- reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)]
-
- if not reviewable:
- return {
- "entries_original": entries,
- "entries_corrected": [dict(e) for e in entries],
- "changes": [],
- "skipped_count": len(entries),
- "model_used": model,
- "duration_ms": 0,
- }
-
- review_entries = [e for _, e in reviewable]
- table_lines = [
- {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
- for e in review_entries
- ]
-
- logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
- len(review_entries), len(entries), model, len(entries) - len(reviewable))
- logger.debug("LLM review input: %s", _json.dumps(table_lines[:3], ensure_ascii=False))
-
- prompt = _build_llm_prompt(table_lines)
-
- t0 = time.time()
- async with httpx.AsyncClient(timeout=300.0) as client:
- resp = await client.post(
- f"{_OLLAMA_URL}/api/chat",
- json={
- "model": model,
- "messages": [{"role": "user", "content": prompt}],
- "stream": False,
- "think": False, # qwen3: disable chain-of-thought (Ollama >=0.6)
- "options": {"temperature": 0.1, "num_predict": 8192},
- },
- )
- resp.raise_for_status()
- content = resp.json().get("message", {}).get("content", "")
- duration_ms = int((time.time() - t0) * 1000)
-
- logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
- logger.debug("LLM review raw response (first 500): %.500s", content)
-
- corrected = _parse_llm_json_array(content)
- logger.info("LLM review: parsed %d corrected entries, applying diff...", len(corrected))
- changes, corrected_entries = _diff_batch(review_entries, corrected)
-
- # Merge corrected entries back into the full list
- all_corrected = [dict(e) for e in entries]
- for batch_idx, (orig_idx, _) in enumerate(reviewable):
- if batch_idx < len(corrected_entries):
- all_corrected[orig_idx] = corrected_entries[batch_idx]
-
- return {
- "entries_original": entries,
- "entries_corrected": all_corrected,
- "changes": changes,
- "skipped_count": len(entries) - len(reviewable),
- "model_used": model,
- "duration_ms": duration_ms,
- }
-
-
-async def llm_review_entries_streaming(
- entries: List[Dict],
- model: str = None,
- batch_size: int = _REVIEW_BATCH_SIZE,
-):
- """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.
-
- Phase 0 (always): Run _fix_character_confusion and emit any changes so they are
- visible in the UI — this is the only place the fix now runs (removed from Step 1
- of build_vocab_pipeline_streaming).
- """
- # --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) ---
- _CONF_FIELDS = ('english', 'german', 'example')
- originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
- _fix_character_confusion(entries) # modifies in-place, returns same list
- char_changes = [
- {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
- for i in range(len(entries))
- for f in _CONF_FIELDS
- if originals[i][f] != entries[i].get(f, '')
- ]
-
- if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
- # Inject char_changes as a batch right after the meta event from the spell checker
- _meta_sent = False
- async for event in spell_review_entries_streaming(entries, batch_size):
- yield event
- if not _meta_sent and event.get('type') == 'meta' and char_changes:
- _meta_sent = True
- yield {
- 'type': 'batch',
- 'changes': char_changes,
- 'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
- 'progress': {'current': 0, 'total': len(entries)},
- }
- return
-
- if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
- logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
-
- # LLM path: emit char_changes first (before meta) so they appear in the UI
- if char_changes:
- yield {
- 'type': 'batch',
- 'changes': char_changes,
- 'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
- 'progress': {'current': 0, 'total': len(entries)},
- }
-
- model = model or OLLAMA_REVIEW_MODEL
-
- # Separate reviewable from skipped entries
- reviewable = []
- skipped_indices = []
- for i, e in enumerate(entries):
- if _entry_needs_review(e):
- reviewable.append((i, e))
- else:
- skipped_indices.append(i)
-
- total_to_review = len(reviewable)
-
- # meta event
- yield {
- "type": "meta",
- "total_entries": len(entries),
- "to_review": total_to_review,
- "skipped": len(skipped_indices),
- "model": model,
- "batch_size": batch_size,
- }
-
- all_changes = []
- all_corrected = [dict(e) for e in entries]
- total_duration_ms = 0
- reviewed_count = 0
-
- # Process in batches
- for batch_start in range(0, total_to_review, batch_size):
- batch_items = reviewable[batch_start:batch_start + batch_size]
- batch_entries = [e for _, e in batch_items]
-
- table_lines = [
- {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
- for e in batch_entries
- ]
-
- prompt = _build_llm_prompt(table_lines)
-
- logger.info("LLM review streaming: batch %d — sending %d entries to %s",
- batch_start // batch_size, len(batch_entries), model)
-
- t0 = time.time()
- async with httpx.AsyncClient(timeout=300.0) as client:
- resp = await client.post(
- f"{_OLLAMA_URL}/api/chat",
- json={
- "model": model,
- "messages": [{"role": "user", "content": prompt}],
- "stream": False,
- "think": False, # qwen3: disable chain-of-thought
- "options": {"temperature": 0.1, "num_predict": 8192},
- },
- )
- resp.raise_for_status()
- content = resp.json().get("message", {}).get("content", "")
- batch_ms = int((time.time() - t0) * 1000)
- total_duration_ms += batch_ms
-
- logger.info("LLM review streaming: response %dms, length=%d chars", batch_ms, len(content))
- logger.debug("LLM review streaming raw (first 500): %.500s", content)
-
- corrected = _parse_llm_json_array(content)
- logger.info("LLM review streaming: parsed %d entries, applying diff...", len(corrected))
- batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)
-
- # Merge back
- for batch_idx, (orig_idx, _) in enumerate(batch_items):
- if batch_idx < len(batch_corrected):
- all_corrected[orig_idx] = batch_corrected[batch_idx]
-
- all_changes.extend(batch_changes)
- reviewed_count += len(batch_items)
-
- # Yield batch result
- yield {
- "type": "batch",
- "batch_index": batch_start // batch_size,
- "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
- "changes": batch_changes,
- "duration_ms": batch_ms,
- "progress": {"current": reviewed_count, "total": total_to_review},
- }
-
- # Complete event
- yield {
- "type": "complete",
- "changes": all_changes,
- "model_used": model,
- "duration_ms": total_duration_ms,
- "total_entries": len(entries),
- "reviewed": total_to_review,
- "skipped": len(skipped_indices),
- "corrections_found": len(all_changes),
- "entries_corrected": all_corrected,
- }
-
-
-def _sanitize_for_json(text: str) -> str:
- """Remove or escape control characters that break JSON parsing.
-
- Keeps tab (\\t), newline (\\n), carriage return (\\r) which are valid
- JSON whitespace. Removes all other ASCII control characters (0x00-0x1f)
- that are only valid inside JSON strings when properly escaped.
- """
- # Replace literal control chars (except \\t \\n \\r) with a space
- return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
-
-
-def _parse_llm_json_array(text: str) -> List[Dict]:
- """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
- # Strip qwen3
', '\n', html_content, flags=re.IGNORECASE)
+ html_content = re.sub(r'