[split-required] Split final 43 files (500-668 LOC) to complete refactoring

klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 09:41:42 +02:00
parent 451365a312
commit bd4b956e3c
113 changed files with 13790 additions and 14148 deletions
--- a/klausur-service/backend/services/grid_detection_service.py
+++ b/klausur-service/backend/services/grid_detection_service.py
@@ -10,166 +10,21 @@ Lizenz: Apache 2.0 (kommerziell nutzbar)

 import math
 import logging
-from enum import Enum
-from dataclasses import dataclass, field
-from typing import List, Optional, Dict, Any, Tuple
+from typing import List
+
+from .grid_detection_models import (
+    A4_WIDTH_MM,
+    A4_HEIGHT_MM,
+    COLUMN_MARGIN_MM,
+    CellStatus,
+    ColumnType,
+    OCRRegion,
+    GridCell,
+    GridResult,
+)

 logger = logging.getLogger(__name__)

-# A4 dimensions
-A4_WIDTH_MM = 210.0
-A4_HEIGHT_MM = 297.0
-
-# Column margin (1mm)
-COLUMN_MARGIN_MM = 1.0
-COLUMN_MARGIN_PCT = (COLUMN_MARGIN_MM / A4_WIDTH_MM) * 100
-
-
-class CellStatus(str, Enum):
-    EMPTY = "empty"
-    RECOGNIZED = "recognized"
-    PROBLEMATIC = "problematic"
-    MANUAL = "manual"
-
-
-class ColumnType(str, Enum):
-    ENGLISH = "english"
-    GERMAN = "german"
-    EXAMPLE = "example"
-    UNKNOWN = "unknown"
-
-
-@dataclass
-class OCRRegion:
-    """A word/phrase detected by OCR with bounding box coordinates in percentage (0-100)."""
-    text: str
-    confidence: float
-    x: float       # X position as percentage of page width
-    y: float       # Y position as percentage of page height
-    width: float   # Width as percentage of page width
-    height: float  # Height as percentage of page height
-
-    @property
-    def x_mm(self) -> float:
-        return round(self.x / 100 * A4_WIDTH_MM, 1)
-
-    @property
-    def y_mm(self) -> float:
-        return round(self.y / 100 * A4_HEIGHT_MM, 1)
-
-    @property
-    def width_mm(self) -> float:
-        return round(self.width / 100 * A4_WIDTH_MM, 1)
-
-    @property
-    def height_mm(self) -> float:
-        return round(self.height / 100 * A4_HEIGHT_MM, 2)
-
-    @property
-    def center_x(self) -> float:
-        return self.x + self.width / 2
-
-    @property
-    def center_y(self) -> float:
-        return self.y + self.height / 2
-
-    @property
-    def right(self) -> float:
-        return self.x + self.width
-
-    @property
-    def bottom(self) -> float:
-        return self.y + self.height
-
-
-@dataclass
-class GridCell:
-    """A cell in the detected grid with coordinates in percentage (0-100)."""
-    row: int
-    col: int
-    x: float
-    y: float
-    width: float
-    height: float
-    text: str = ""
-    confidence: float = 0.0
-    status: CellStatus = CellStatus.EMPTY
-    column_type: ColumnType = ColumnType.UNKNOWN
-    logical_row: int = 0
-    logical_col: int = 0
-    is_continuation: bool = False
-
-    @property
-    def x_mm(self) -> float:
-        return round(self.x / 100 * A4_WIDTH_MM, 1)
-
-    @property
-    def y_mm(self) -> float:
-        return round(self.y / 100 * A4_HEIGHT_MM, 1)
-
-    @property
-    def width_mm(self) -> float:
-        return round(self.width / 100 * A4_WIDTH_MM, 1)
-
-    @property
-    def height_mm(self) -> float:
-        return round(self.height / 100 * A4_HEIGHT_MM, 2)
-
-    def to_dict(self) -> dict:
-        return {
-            "row": self.row,
-            "col": self.col,
-            "x": round(self.x, 2),
-            "y": round(self.y, 2),
-            "width": round(self.width, 2),
-            "height": round(self.height, 2),
-            "x_mm": self.x_mm,
-            "y_mm": self.y_mm,
-            "width_mm": self.width_mm,
-            "height_mm": self.height_mm,
-            "text": self.text,
-            "confidence": self.confidence,
-            "status": self.status.value,
-            "column_type": self.column_type.value,
-            "logical_row": self.logical_row,
-            "logical_col": self.logical_col,
-            "is_continuation": self.is_continuation,
-        }
-
-
-@dataclass
-class GridResult:
-    """Result of grid detection."""
-    rows: int = 0
-    columns: int = 0
-    cells: List[List[GridCell]] = field(default_factory=list)
-    column_types: List[str] = field(default_factory=list)
-    column_boundaries: List[float] = field(default_factory=list)
-    row_boundaries: List[float] = field(default_factory=list)
-    deskew_angle: float = 0.0
-    stats: Dict[str, Any] = field(default_factory=dict)
-
-    def to_dict(self) -> dict:
-        cells_dicts = []
-        for row_cells in self.cells:
-            cells_dicts.append([c.to_dict() for c in row_cells])
-
-        return {
-            "rows": self.rows,
-            "columns": self.columns,
-            "cells": cells_dicts,
-            "column_types": self.column_types,
-            "column_boundaries": [round(b, 2) for b in self.column_boundaries],
-            "row_boundaries": [round(b, 2) for b in self.row_boundaries],
-            "deskew_angle": round(self.deskew_angle, 2),
-            "stats": self.stats,
-            "page_dimensions": {
-                "width_mm": A4_WIDTH_MM,
-                "height_mm": A4_HEIGHT_MM,
-                "format": "A4",
-            },
-        }
-

 class GridDetectionService:
    """Detect grid/table structure from OCR bounding-box regions."""
@@ -184,7 +39,7 @@ class GridDetectionService:
        """Calculate page skew angle from OCR region positions.

        Uses left-edge alignment of regions to detect consistent tilt.
-        Returns angle in degrees, clamped to ±5°.
+        Returns angle in degrees, clamped to +/-5 degrees.
        """
        if len(regions) < 3:
            return 0.0
@@ -229,12 +84,12 @@ class GridDetectionService:
        slope = (n * sum_xy - sum_y * sum_x) / denom

        # Convert slope to angle (slope is dx/dy in percent space)
-        # Adjust for aspect ratio: A4 is 210/297 ≈ 0.707
+        # Adjust for aspect ratio: A4 is 210/297 ~ 0.707
        aspect = A4_WIDTH_MM / A4_HEIGHT_MM
        angle_rad = math.atan(slope * aspect)
        angle_deg = math.degrees(angle_rad)

-        # Clamp to ±5°
+        # Clamp to +/-5 degrees
        return max(-5.0, min(5.0, round(angle_deg, 2)))

    def apply_deskew_to_regions(self, regions: List[OCRRegion], angle: float) -> List[OCRRegion]: