[split-required] Split final 43 files (500-668 LOC) to complete refactoring
klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
164
klausur-service/backend/services/grid_detection_models.py
Normal file
164
klausur-service/backend/services/grid_detection_models.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""
|
||||
Grid Detection Models v4
|
||||
|
||||
Data classes for OCR grid detection results.
|
||||
Coordinates use percentage (0-100) and mm (A4 format).
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict, Any
|
||||
|
||||
# Page size of an A4 sheet in millimetres; used to convert percentage
# coordinates (0-100) into physical distances.
A4_WIDTH_MM = 210.0
A4_HEIGHT_MM = 297.0

# Column margin of 1 mm, plus the same margin expressed as a percentage
# of the A4 page width.
COLUMN_MARGIN_MM = 1.0
COLUMN_MARGIN_PCT = (COLUMN_MARGIN_MM / A4_WIDTH_MM) * 100
|
||||
|
||||
|
||||
class CellStatus(str, Enum):
    """Status label attached to a grid cell.

    Members are ``str`` subclasses, so each one compares equal to its raw
    string value and serializes as that string via ``.value``.
    """

    EMPTY = "empty"
    RECOGNIZED = "recognized"
    PROBLEMATIC = "problematic"
    MANUAL = "manual"
|
||||
|
||||
|
||||
class ColumnType(str, Enum):
    """Kind of content a grid column holds.

    Members are ``str`` subclasses, so each one compares equal to its raw
    string value and serializes as that string via ``.value``.
    """

    ENGLISH = "english"
    GERMAN = "german"
    EXAMPLE = "example"
    UNKNOWN = "unknown"
|
||||
|
||||
|
||||
@dataclass
class OCRRegion:
    """A word or phrase found by OCR, with its bounding box.

    All four geometry fields are percentages (0-100) of the page: ``x`` and
    ``width`` relative to the page width, ``y`` and ``height`` relative to
    the page height. The ``*_mm`` properties convert them to millimetres
    using the module-level A4 page dimensions.
    """

    text: str
    confidence: float
    x: float  # left edge, percent of page width
    y: float  # top edge, percent of page height
    width: float  # box width, percent of page width
    height: float  # box height, percent of page height

    @staticmethod
    def _to_mm(percent: float, page_extent_mm: float, ndigits: int) -> float:
        """Scale a percentage onto ``page_extent_mm`` and round the result."""
        return round(percent / 100 * page_extent_mm, ndigits)

    @property
    def x_mm(self) -> float:
        """Left edge in millimetres, rounded to one decimal."""
        return self._to_mm(self.x, A4_WIDTH_MM, 1)

    @property
    def y_mm(self) -> float:
        """Top edge in millimetres, rounded to one decimal."""
        return self._to_mm(self.y, A4_HEIGHT_MM, 1)

    @property
    def width_mm(self) -> float:
        """Box width in millimetres, rounded to one decimal."""
        return self._to_mm(self.width, A4_WIDTH_MM, 1)

    @property
    def height_mm(self) -> float:
        """Box height in millimetres, rounded to two decimals."""
        # Heights keep one more decimal than the other mm values.
        return self._to_mm(self.height, A4_HEIGHT_MM, 2)

    @property
    def center_x(self) -> float:
        """Horizontal midpoint of the box, in percent."""
        half_width = self.width / 2
        return self.x + half_width

    @property
    def center_y(self) -> float:
        """Vertical midpoint of the box, in percent."""
        half_height = self.height / 2
        return self.y + half_height

    @property
    def right(self) -> float:
        """Right edge of the box, in percent."""
        return self.x + self.width

    @property
    def bottom(self) -> float:
        """Bottom edge of the box, in percent."""
        return self.y + self.height
|
||||
|
||||
|
||||
@dataclass
class GridCell:
    """One cell of the detected grid.

    ``x``, ``y``, ``width`` and ``height`` are percentages (0-100) of the
    page. The ``*_mm`` properties convert them to millimetres using the
    module-level A4 page dimensions.
    """

    row: int
    col: int
    x: float
    y: float
    width: float
    height: float
    text: str = ""
    confidence: float = 0.0
    status: CellStatus = CellStatus.EMPTY
    column_type: ColumnType = ColumnType.UNKNOWN
    logical_row: int = 0
    logical_col: int = 0
    is_continuation: bool = False

    @staticmethod
    def _to_mm(percent: float, page_extent_mm: float, ndigits: int) -> float:
        """Scale a percentage onto ``page_extent_mm`` and round the result."""
        return round(percent / 100 * page_extent_mm, ndigits)

    @property
    def x_mm(self) -> float:
        """Left edge in millimetres, rounded to one decimal."""
        return self._to_mm(self.x, A4_WIDTH_MM, 1)

    @property
    def y_mm(self) -> float:
        """Top edge in millimetres, rounded to one decimal."""
        return self._to_mm(self.y, A4_HEIGHT_MM, 1)

    @property
    def width_mm(self) -> float:
        """Cell width in millimetres, rounded to one decimal."""
        return self._to_mm(self.width, A4_WIDTH_MM, 1)

    @property
    def height_mm(self) -> float:
        """Cell height in millimetres, rounded to two decimals."""
        # Heights keep one more decimal than the other mm values.
        return self._to_mm(self.height, A4_HEIGHT_MM, 2)

    def to_dict(self) -> dict:
        """Return a plain-dict view of the cell.

        Percentage geometry is rounded to two decimals, the enum fields are
        emitted as their string values, and every other field is passed
        through unchanged.
        """
        payload = {"row": self.row, "col": self.col}
        for name in ("x", "y", "width", "height"):
            payload[name] = round(getattr(self, name), 2)
        # The mm properties are already rounded, so they are copied as-is.
        for name in ("x_mm", "y_mm", "width_mm", "height_mm"):
            payload[name] = getattr(self, name)
        payload.update(
            text=self.text,
            confidence=self.confidence,
            status=self.status.value,
            column_type=self.column_type.value,
            logical_row=self.logical_row,
            logical_col=self.logical_col,
            is_continuation=self.is_continuation,
        )
        return payload
|
||||
|
||||
|
||||
@dataclass
class GridResult:
    """Outcome of a grid detection run.

    ``cells`` is a list of rows, each row being a list of ``GridCell``
    objects. Every field has a default, so an empty result can be built
    with no arguments.
    """

    rows: int = 0
    columns: int = 0
    cells: List[List[GridCell]] = field(default_factory=list)
    column_types: List[str] = field(default_factory=list)
    column_boundaries: List[float] = field(default_factory=list)
    row_boundaries: List[float] = field(default_factory=list)
    deskew_angle: float = 0.0
    stats: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain-dict view of the result.

        Cells are converted through ``GridCell.to_dict``; boundaries and the
        deskew angle are rounded to two decimals. ``column_types`` and
        ``stats`` are included by reference, not copied. A fixed
        ``page_dimensions`` entry reports the A4 size.
        """
        serialized_cells = [
            [cell.to_dict() for cell in grid_row] for grid_row in self.cells
        ]
        column_edges = [round(edge, 2) for edge in self.column_boundaries]
        row_edges = [round(edge, 2) for edge in self.row_boundaries]
        page = {
            "width_mm": A4_WIDTH_MM,
            "height_mm": A4_HEIGHT_MM,
            "format": "A4",
        }
        return {
            "rows": self.rows,
            "columns": self.columns,
            "cells": serialized_cells,
            "column_types": self.column_types,
            "column_boundaries": column_edges,
            "row_boundaries": row_edges,
            "deskew_angle": round(self.deskew_angle, 2),
            "stats": self.stats,
            "page_dimensions": page,
        }
|
||||
Reference in New Issue
Block a user