[split-required] Split final 43 files (500-668 LOC) to complete refactoring

klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 09:41:42 +02:00
parent 451365a312
commit bd4b956e3c
113 changed files with 13790 additions and 14148 deletions

View File

@@ -10,166 +10,21 @@ Lizenz: Apache 2.0 (kommerziell nutzbar)
import math
import logging
from enum import Enum
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any, Tuple
from typing import List
from .grid_detection_models import (
A4_WIDTH_MM,
A4_HEIGHT_MM,
COLUMN_MARGIN_MM,
CellStatus,
ColumnType,
OCRRegion,
GridCell,
GridResult,
)
logger = logging.getLogger(__name__)
# Physical size of an A4 page in millimetres (width x height).
A4_WIDTH_MM = 210.0
A4_HEIGHT_MM = 297.0

# Margin applied around columns: 1 mm, plus the same margin expressed as a
# percentage of the A4 page width.
COLUMN_MARGIN_MM = 1.0
COLUMN_MARGIN_PCT = COLUMN_MARGIN_MM / A4_WIDTH_MM * 100
class CellStatus(str, Enum):
    """Status values for a grid cell (used as ``GridCell.status``).

    The ``str`` mix-in makes each member compare equal to its string value,
    e.g. ``CellStatus.EMPTY == "empty"``.
    """

    EMPTY = "empty"
    RECOGNIZED = "recognized"
    PROBLEMATIC = "problematic"
    MANUAL = "manual"
class ColumnType(str, Enum):
    """Column classification values (used as ``GridCell.column_type``).

    The ``str`` mix-in makes each member compare equal to its string value,
    e.g. ``ColumnType.GERMAN == "german"``.
    """

    ENGLISH = "english"
    GERMAN = "german"
    EXAMPLE = "example"
    UNKNOWN = "unknown"
@dataclass
class OCRRegion:
    """A word/phrase detected by OCR with bounding box coordinates in percentage (0-100).

    The ``*_mm`` properties convert the percentage coordinates to millimetres
    using the module-level ``A4_WIDTH_MM`` / ``A4_HEIGHT_MM`` constants.
    """

    text: str
    confidence: float
    x: float  # X position as percentage of page width
    y: float  # Y position as percentage of page height
    width: float  # Width as percentage of page width
    height: float  # Height as percentage of page height

    @property
    def x_mm(self) -> float:
        """Left edge in millimetres, rounded to 1 decimal."""
        return round(self.x / 100 * A4_WIDTH_MM, 1)

    @property
    def y_mm(self) -> float:
        """Top edge in millimetres, rounded to 1 decimal."""
        return round(self.y / 100 * A4_HEIGHT_MM, 1)

    @property
    def width_mm(self) -> float:
        """Width in millimetres, rounded to 1 decimal."""
        return round(self.width / 100 * A4_WIDTH_MM, 1)

    @property
    def height_mm(self) -> float:
        """Height in millimetres, rounded to 2 decimals (the other mm values use 1)."""
        return round(self.height / 100 * A4_HEIGHT_MM, 2)

    @property
    def center_x(self) -> float:
        """Horizontal centre of the box, in percent of page width."""
        return self.x + self.width / 2

    @property
    def center_y(self) -> float:
        """Vertical centre of the box, in percent of page height."""
        return self.y + self.height / 2

    @property
    def right(self) -> float:
        """Right edge of the box, in percent of page width."""
        return self.x + self.width

    @property
    def bottom(self) -> float:
        """Bottom edge of the box, in percent of page height."""
        return self.y + self.height
@dataclass
class GridCell:
    """A cell in the detected grid with coordinates in percentage (0-100).

    The ``*_mm`` properties convert the percentage coordinates to millimetres
    using the module-level ``A4_WIDTH_MM`` / ``A4_HEIGHT_MM`` constants.
    """

    row: int
    col: int
    x: float
    y: float
    width: float
    height: float
    text: str = ""
    confidence: float = 0.0
    status: CellStatus = CellStatus.EMPTY
    column_type: ColumnType = ColumnType.UNKNOWN
    logical_row: int = 0
    logical_col: int = 0
    is_continuation: bool = False

    @property
    def x_mm(self) -> float:
        """Left edge in millimetres, rounded to 1 decimal."""
        return round(self.x / 100 * A4_WIDTH_MM, 1)

    @property
    def y_mm(self) -> float:
        """Top edge in millimetres, rounded to 1 decimal."""
        return round(self.y / 100 * A4_HEIGHT_MM, 1)

    @property
    def width_mm(self) -> float:
        """Width in millimetres, rounded to 1 decimal."""
        return round(self.width / 100 * A4_WIDTH_MM, 1)

    @property
    def height_mm(self) -> float:
        """Height in millimetres, rounded to 2 decimals (the other mm values use 1)."""
        return round(self.height / 100 * A4_HEIGHT_MM, 2)

    def to_dict(self) -> dict:
        """Return the cell as a plain dict.

        Percentage coordinates are rounded to 2 decimals, the millimetre
        values come from the ``*_mm`` properties, and ``status`` /
        ``column_type`` are emitted as their enum string values.
        """
        return {
            "row": self.row,
            "col": self.col,
            "x": round(self.x, 2),
            "y": round(self.y, 2),
            "width": round(self.width, 2),
            "height": round(self.height, 2),
            "x_mm": self.x_mm,
            "y_mm": self.y_mm,
            "width_mm": self.width_mm,
            "height_mm": self.height_mm,
            "text": self.text,
            "confidence": self.confidence,
            "status": self.status.value,
            "column_type": self.column_type.value,
            "logical_row": self.logical_row,
            "logical_col": self.logical_col,
            "is_continuation": self.is_continuation,
        }
@dataclass
class GridResult:
    """Result of grid detection.

    Holds the detected cells as a list of rows (each a list of ``GridCell``)
    together with the column/row boundaries, the deskew angle and a free-form
    ``stats`` dict.
    """

    rows: int = 0
    columns: int = 0
    cells: List[List[GridCell]] = field(default_factory=list)
    column_types: List[str] = field(default_factory=list)
    column_boundaries: List[float] = field(default_factory=list)
    row_boundaries: List[float] = field(default_factory=list)
    deskew_angle: float = 0.0
    stats: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the result as a plain dict.

        Each cell is converted via ``GridCell.to_dict()``; boundaries and the
        deskew angle are rounded to 2 decimals; ``stats`` is passed through
        unchanged. A fixed ``page_dimensions`` entry reports the A4 page size.
        """
        cells_dicts = [
            [cell.to_dict() for cell in row_cells] for row_cells in self.cells
        ]
        return {
            "rows": self.rows,
            "columns": self.columns,
            "cells": cells_dicts,
            "column_types": self.column_types,
            "column_boundaries": [round(b, 2) for b in self.column_boundaries],
            "row_boundaries": [round(b, 2) for b in self.row_boundaries],
            "deskew_angle": round(self.deskew_angle, 2),
            "stats": self.stats,
            "page_dimensions": {
                "width_mm": A4_WIDTH_MM,
                "height_mm": A4_HEIGHT_MM,
                "format": "A4",
            },
        }
class GridDetectionService:
"""Detect grid/table structure from OCR bounding-box regions."""
@@ -184,7 +39,7 @@ class GridDetectionService:
"""Calculate page skew angle from OCR region positions.
Uses left-edge alignment of regions to detect consistent tilt.
Returns angle in degrees, clamped to ±5°.
Returns angle in degrees, clamped to +/-5 degrees.
"""
if len(regions) < 3:
return 0.0
@@ -229,12 +84,12 @@ class GridDetectionService:
slope = (n * sum_xy - sum_y * sum_x) / denom
# Convert slope to angle (slope is dx/dy in percent space)
# Adjust for aspect ratio: A4 is 210/297 ≈ 0.707
# Adjust for aspect ratio: A4 is 210/297 ~ 0.707
aspect = A4_WIDTH_MM / A4_HEIGHT_MM
angle_rad = math.atan(slope * aspect)
angle_deg = math.degrees(angle_rad)
# Clamp to ±5°
# Clamp to +/-5 degrees
return max(-5.0, min(5.0, round(angle_deg, 2)))
def apply_deskew_to_regions(self, regions: List[OCRRegion], angle: float) -> List[OCRRegion]: