This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/klausur-service/backend/tests/test_grid_detection.py
BreakPilot Dev baee45b861 feat(ocr): Add Grid Detection v4 tests, docs, and SBOM update
- Add comprehensive tests for grid_detection_service.py (31 tests)
  - mm coordinate conversion tests
  - Deskew calculation tests
  - Column detection tests
  - Integration tests for vocabulary tables

- Add OCR-Compare documentation (OCR-Compare.md)
  - mm coordinate system documentation
  - Deskew correction documentation
  - Worksheet Editor integration guide
  - API endpoints documentation

- Add TypeScript tests for ocr-integration.ts
  - mm to pixel conversion tests
  - OCR export format tests
  - localStorage operations tests

- Update SBOM to v1.5.0
  - Add OCR Grid Detection System section
  - Document Fabric.js (MIT) for Worksheet Editor
  - Document NumPy and OpenCV usage

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-08 21:31:35 -08:00

386 lines
14 KiB
Python

"""
Tests for Grid Detection Service v4
Tests cover:
- mm coordinate conversion
- Deskew calculation
- Column detection with 1mm margin
- Data class functionality
License: Apache 2.0 (commercial use permitted)
"""
import pytest
import math
from typing import List
# Import the service under test
import sys
sys.path.insert(0, '/app')
from services.grid_detection_service import (
GridDetectionService,
OCRRegion,
GridCell,
CellStatus,
ColumnType,
A4_WIDTH_MM,
A4_HEIGHT_MM,
COLUMN_MARGIN_MM,
COLUMN_MARGIN_PCT
)
class TestOCRRegionMMConversion:
    """Test mm coordinate conversion for OCR regions.

    Regions are built with percent-based coordinates and the derived
    properties are checked against A4 values (210mm x 297mm). Derived floats
    are compared with ``pytest.approx`` rather than ``==`` so the tests do not
    depend on the exact order of floating-point operations in the service.
    """

    def test_x_mm_conversion(self):
        """Test X coordinate conversion from percent to mm."""
        # 50% of A4 width = 105mm
        region = OCRRegion(text="test", confidence=0.9, x=50.0, y=0.0, width=10.0, height=5.0)
        assert region.x_mm == pytest.approx(105.0)

    def test_y_mm_conversion(self):
        """Test Y coordinate conversion from percent to mm."""
        # 33.33% of A4 height = 99mm (approx), hence the loose tolerance
        region = OCRRegion(text="test", confidence=0.9, x=0.0, y=33.33, width=10.0, height=5.0)
        assert region.y_mm == pytest.approx(99.0, abs=0.5)

    def test_width_mm_conversion(self):
        """Test width conversion from percent to mm."""
        # 10% of A4 width = 21mm
        region = OCRRegion(text="test", confidence=0.9, x=0.0, y=0.0, width=10.0, height=5.0)
        assert region.width_mm == pytest.approx(21.0)

    def test_height_mm_conversion(self):
        """Test height conversion from percent to mm."""
        # 5% of A4 height = 14.85mm
        region = OCRRegion(text="test", confidence=0.9, x=0.0, y=0.0, width=10.0, height=5.0)
        assert region.height_mm == pytest.approx(14.85, abs=0.01)

    def test_center_coordinates(self):
        """Test center coordinate calculation."""
        region = OCRRegion(text="test", confidence=0.9, x=10.0, y=20.0, width=20.0, height=10.0)
        # Expected center is x + width / 2 and y + height / 2
        assert region.center_x == pytest.approx(20.0)
        assert region.center_y == pytest.approx(25.0)

    def test_right_bottom_edges(self):
        """Test right and bottom edge calculation."""
        region = OCRRegion(text="test", confidence=0.9, x=10.0, y=20.0, width=30.0, height=15.0)
        # Expected edges are x + width and y + height
        assert region.right == pytest.approx(40.0)
        assert region.bottom == pytest.approx(35.0)
class TestGridCellMMConversion:
    """Test mm coordinate conversion for grid cells.

    Computed mm values are compared with ``pytest.approx`` because products
    such as 20% of 297mm are not guaranteed to be bit-identical to the
    decimal literal they are compared against.
    """

    def test_cell_to_dict_includes_mm(self):
        """Test that to_dict includes mm coordinates."""
        cell = GridCell(row=0, col=0, x=10.0, y=20.0, width=30.0, height=5.0, text="hello")
        result = cell.to_dict()

        for key in ("x_mm", "y_mm", "width_mm", "height_mm"):
            assert key in result

        # 10% of 210mm = 21mm
        assert result["x_mm"] == pytest.approx(21.0)
        # 20% of 297mm = 59.4mm
        assert result["y_mm"] == pytest.approx(59.4)

    def test_cell_mm_coordinates(self):
        """Test direct mm property access."""
        cell = GridCell(row=0, col=0, x=50.0, y=50.0, width=20.0, height=3.0)
        assert cell.x_mm == pytest.approx(105.0)  # 50% of 210mm
        assert cell.y_mm == pytest.approx(148.5)  # 50% of 297mm
        assert cell.width_mm == pytest.approx(42.0)  # 20% of 210mm
        assert cell.height_mm == pytest.approx(8.91, abs=0.01)  # 3% of 297mm

    def test_cell_to_dict_includes_all_fields(self):
        """Test that to_dict includes all expected fields."""
        cell = GridCell(
            row=1, col=2, x=10.0, y=20.0, width=30.0, height=5.0,
            text="test", confidence=0.95, status=CellStatus.RECOGNIZED,
            column_type=ColumnType.ENGLISH, logical_row=0, logical_col=0,
            is_continuation=False,
        )
        result = cell.to_dict()

        assert result["row"] == 1
        assert result["col"] == 2
        assert result["text"] == "test"
        assert result["confidence"] == 0.95
        # Enum members are expected to be serialized by their string value
        assert result["status"] == "recognized"
        assert result["column_type"] == "english"
        assert result["logical_row"] == 0
        assert result["logical_col"] == 0
        # Identity check: the flag must be the boolean False, not merely falsy-equal
        assert result["is_continuation"] is False
class TestA4Constants:
    """Sanity checks for the page-size and column-margin constants."""

    def test_a4_width_mm(self):
        """The A4 width constant must be 210mm."""
        expected_width = 210.0
        assert A4_WIDTH_MM == expected_width

    def test_a4_height_mm(self):
        """The A4 height constant must be 297mm."""
        expected_height = 297.0
        assert A4_HEIGHT_MM == expected_height

    def test_column_margin_mm(self):
        """The column margin constant must be 1mm."""
        expected_margin = 1.0
        assert COLUMN_MARGIN_MM == expected_margin

    def test_column_margin_percent(self):
        """The margin percentage must match 1mm relative to a 210mm width."""
        margin_pct = (1.0 / 210.0) * 100
        deviation = abs(COLUMN_MARGIN_PCT - margin_pct)
        assert deviation < 0.001
class TestGridDetectionServiceInit:
    """Constructor behaviour of GridDetectionService."""

    def test_init_with_defaults(self):
        """A service built without arguments exposes the default settings."""
        default_service = GridDetectionService()

        observed = (
            default_service.y_tolerance_pct,
            default_service.padding_pct,
            default_service.column_margin_mm,
        )
        assert observed == (1.5, 0.3, COLUMN_MARGIN_MM)

    def test_init_with_custom_params(self):
        """Keyword arguments passed to the constructor are stored unchanged."""
        overrides = dict(y_tolerance_pct=2.0, padding_pct=0.5, column_margin_mm=2.0)
        custom_service = GridDetectionService(**overrides)

        # Each override must be readable back under the same attribute name
        for attribute, wanted in overrides.items():
            assert getattr(custom_service, attribute) == wanted
class TestDeskewCalculation:
    """Behaviour of GridDetectionService.calculate_deskew_angle."""

    @staticmethod
    def _stacked_regions(x_positions):
        """Build regions "a".."e" at y=10,20,...,50, one per given x position.

        Fewer x positions yield fewer regions; every region is 5 x 2 with
        confidence 0.9.
        """
        labels = "abcde"
        y_positions = (10.0, 20.0, 30.0, 40.0, 50.0)
        return [
            OCRRegion(text=label, confidence=0.9, x=x_pos, y=y_pos, width=5.0, height=2.0)
            for label, x_pos, y_pos in zip(labels, x_positions, y_positions)
        ]

    def test_calculate_deskew_no_regions(self):
        """An empty region list yields an angle of exactly 0."""
        detector = GridDetectionService()
        assert detector.calculate_deskew_angle([]) == 0.0

    def test_calculate_deskew_few_regions(self):
        """A single region is too few to estimate skew, so the angle is 0."""
        detector = GridDetectionService()
        lone_region = self._stacked_regions([10.0])
        assert detector.calculate_deskew_angle(lone_region) == 0.0

    def test_calculate_deskew_perfectly_aligned(self):
        """Regions sharing the same x give an angle very close to 0."""
        detector = GridDetectionService()
        # All five left edges sit at x=10%
        aligned = self._stacked_regions([10.0, 10.0, 10.0, 10.0, 10.0])
        skew = detector.calculate_deskew_angle(aligned)
        assert abs(skew) < 0.5

    def test_calculate_deskew_tilted_right(self):
        """Left edges drifting right while going down give a positive angle."""
        detector = GridDetectionService()
        # x grows by 1 for every 10 of y
        drifting = self._stacked_regions([10.0, 11.0, 12.0, 13.0, 14.0])
        skew = detector.calculate_deskew_angle(drifting)
        assert skew > 0

    def test_calculate_deskew_max_angle(self):
        """An extreme tilt is reported with magnitude of at most 5 degrees."""
        detector = GridDetectionService()
        # x grows by 10 for every 10 of y
        steep = self._stacked_regions([5.0, 15.0, 25.0, 35.0, 45.0])
        skew = detector.calculate_deskew_angle(steep)
        assert abs(skew) <= 5.0
class TestDeskewApplication:
    """Behaviour of GridDetectionService.apply_deskew_to_regions."""

    def test_apply_deskew_zero_angle(self):
        """A zero angle leaves the region's x and y untouched."""
        source = OCRRegion(text="a", confidence=0.9, x=10.0, y=20.0, width=5.0, height=2.0)
        deskewed = GridDetectionService().apply_deskew_to_regions([source], 0.0)

        assert len(deskewed) == 1
        only_region = deskewed[0]
        assert only_region.x == 10.0
        assert only_region.y == 20.0

    def test_apply_deskew_preserves_text(self):
        """Text and confidence are carried over when a non-zero angle is applied."""
        source = OCRRegion(text="hello", confidence=0.95, x=10.0, y=20.0, width=5.0, height=2.0)
        deskewed = GridDetectionService().apply_deskew_to_regions([source], 2.0)

        first = deskewed[0]
        assert first.text == "hello"
        assert first.confidence == 0.95
class TestCellStatus:
    """Status values carried by GridCell instances."""

    # Position and size shared by every cell built in this class
    _GEOMETRY = dict(row=0, col=0, x=0, y=0, width=10, height=5)

    def test_cell_status_empty(self):
        """A cell created with empty text and no explicit status reports EMPTY."""
        blank_cell = GridCell(**self._GEOMETRY, text="")
        assert blank_cell.status == CellStatus.EMPTY

    def test_cell_status_recognized(self):
        """A cell constructed with RECOGNIZED keeps that status."""
        confident_cell = GridCell(
            **self._GEOMETRY,
            text="hello",
            confidence=0.9,
            status=CellStatus.RECOGNIZED,
        )
        assert confident_cell.status == CellStatus.RECOGNIZED

    def test_cell_status_problematic(self):
        """A low-confidence cell constructed with PROBLEMATIC keeps that status."""
        doubtful_cell = GridCell(
            **self._GEOMETRY,
            text="hello",
            confidence=0.3,
            status=CellStatus.PROBLEMATIC,
        )
        assert doubtful_cell.status == CellStatus.PROBLEMATIC
class TestColumnType:
    """String values of the ColumnType enum."""

    def test_column_type_values(self):
        """Each ColumnType member maps to its lowercase string value."""
        expected_values = (
            (ColumnType.ENGLISH, "english"),
            (ColumnType.GERMAN, "german"),
            (ColumnType.EXAMPLE, "example"),
            (ColumnType.UNKNOWN, "unknown"),
        )
        for member, wanted in expected_values:
            assert member.value == wanted
class TestDetectGrid:
    """Behaviour of GridDetectionService.detect_grid and its result object."""

    @staticmethod
    def _house_region():
        """Return the single-word "house" region used by several tests."""
        return OCRRegion(text="house", confidence=0.9, x=10.0, y=10.0, width=10.0, height=2.0)

    def test_detect_grid_empty_regions(self):
        """No input regions produce a grid with zero rows, columns and cells."""
        grid = GridDetectionService().detect_grid([])

        assert grid.rows == 0
        assert grid.columns == 0
        assert len(grid.cells) == 0

    def test_detect_grid_single_word(self):
        """One word is enough to produce at least one row and one column."""
        grid = GridDetectionService().detect_grid([self._house_region()])

        assert grid.rows >= 1
        assert grid.columns >= 1

    def test_detect_grid_result_has_page_dimensions(self):
        """The serialized result describes an A4 page of 210mm x 297mm."""
        grid = GridDetectionService().detect_grid([self._house_region()])
        serialized = grid.to_dict()

        assert "page_dimensions" in serialized
        page = serialized["page_dimensions"]
        assert page["width_mm"] == 210.0
        assert page["height_mm"] == 297.0
        assert page["format"] == "A4"

    def test_detect_grid_result_has_stats(self):
        """The serialized result carries recognized and coverage statistics."""
        word_pair = [
            self._house_region(),
            OCRRegion(text="Haus", confidence=0.8, x=50.0, y=10.0, width=8.0, height=2.0),
        ]
        serialized = GridDetectionService().detect_grid(word_pair).to_dict()

        assert "stats" in serialized
        stats = serialized["stats"]
        assert "recognized" in stats
        assert "coverage" in stats
class TestIntegration:
    """Integration tests for full analysis pipeline."""

    def test_full_vocabulary_table_analysis(self):
        """Test analysis of a typical vocabulary table.

        Feeds nine regions laid out as three rows by three columns through
        ``detect_grid`` and checks the serialized result. The mm-coordinate
        checks are unconditional: an empty grid fails the test instead of
        skipping the assertions silently.
        """
        service = GridDetectionService()

        # Simulate a vocabulary table with 3 columns
        regions = [
            # Row 1
            OCRRegion(text="house", confidence=0.95, x=10.0, y=15.0, width=12.0, height=2.5),
            OCRRegion(text="Haus", confidence=0.92, x=45.0, y=15.0, width=8.0, height=2.5),
            OCRRegion(text="This is a house.", confidence=0.88, x=70.0, y=15.0, width=25.0, height=2.5),
            # Row 2
            OCRRegion(text="car", confidence=0.94, x=10.0, y=22.0, width=8.0, height=2.5),
            OCRRegion(text="Auto", confidence=0.91, x=45.0, y=22.0, width=9.0, height=2.5),
            OCRRegion(text="I drive a car.", confidence=0.85, x=70.0, y=22.0, width=22.0, height=2.5),
            # Row 3
            OCRRegion(text="tree", confidence=0.96, x=10.0, y=29.0, width=9.0, height=2.5),
            OCRRegion(text="Baum", confidence=0.93, x=45.0, y=29.0, width=10.0, height=2.5),
            OCRRegion(text="The tree is tall.", confidence=0.87, x=70.0, y=29.0, width=24.0, height=2.5),
        ]

        result = service.detect_grid(regions)
        result_dict = result.to_dict()

        # Verify structure
        assert "cells" in result_dict
        assert "page_dimensions" in result_dict
        assert "stats" in result_dict

        # Verify page dimensions
        assert result_dict["page_dimensions"]["format"] == "A4"

        # Verify cells have mm coordinates; the grid must not be empty here,
        # otherwise the checks below would never run.
        cells = result_dict["cells"]
        assert len(cells) > 0, "expected at least one grid row for a 3x3 table"
        assert len(cells[0]) > 0, "expected at least one cell in the first grid row"

        first_cell = cells[0][0]
        for key in ("x_mm", "y_mm", "width_mm", "height_mm"):
            assert key in first_cell
if __name__ == "__main__":
    # Propagate pytest's return code so that running this file directly
    # exits non-zero when a test fails (the bare call discarded it).
    raise SystemExit(pytest.main([__file__, "-v"]))