"""
Tests for Grid Detection Service v4

Tests cover:
- mm coordinate conversion
- Deskew calculation
- Column detection with 1mm margin
- Data class functionality

License: Apache 2.0 (commercial use permitted)
"""

import pytest
import math
from typing import List

# Import the service under test.
# NOTE(review): '/app' is presumably the project root inside the test
# container -- confirm before running these tests elsewhere.
import sys
sys.path.insert(0, '/app')

from services.grid_detection_service import (
    GridDetectionService,
    OCRRegion,
    GridCell,
    CellStatus,
    ColumnType,
    A4_WIDTH_MM,
    A4_HEIGHT_MM,
    COLUMN_MARGIN_MM,
    COLUMN_MARGIN_PCT
)


class TestOCRRegionMMConversion:
    """Test mm coordinate conversion for OCR regions.

    Region coordinates are given in percent of the page; the ``*_mm``
    properties are expected to express them in millimetres of an A4 page.
    Computed floats are compared with ``pytest.approx`` so the tests do not
    depend on the exact order of floating-point operations in the service.
    """

    def test_x_mm_conversion(self):
        """Test X coordinate conversion from percent to mm."""
        # 50% of A4 width = 105mm
        region = OCRRegion(text="test", confidence=0.9, x=50.0, y=0.0, width=10.0, height=5.0)
        assert region.x_mm == pytest.approx(105.0)

    def test_y_mm_conversion(self):
        """Test Y coordinate conversion from percent to mm."""
        # 33.33% of A4 height = 99mm (approx)
        region = OCRRegion(text="test", confidence=0.9, x=0.0, y=33.33, width=10.0, height=5.0)
        assert region.y_mm == pytest.approx(99.0, abs=0.5)

    def test_width_mm_conversion(self):
        """Test width conversion from percent to mm."""
        # 10% of A4 width = 21mm
        region = OCRRegion(text="test", confidence=0.9, x=0.0, y=0.0, width=10.0, height=5.0)
        assert region.width_mm == pytest.approx(21.0)

    def test_height_mm_conversion(self):
        """Test height conversion from percent to mm."""
        # 5% of A4 height = 14.85mm
        region = OCRRegion(text="test", confidence=0.9, x=0.0, y=0.0, width=10.0, height=5.0)
        assert region.height_mm == pytest.approx(14.85, abs=0.01)

    def test_center_coordinates(self):
        """Test center coordinate calculation."""
        region = OCRRegion(text="test", confidence=0.9, x=10.0, y=20.0, width=20.0, height=10.0)
        # Expected centre: x + width / 2 and y + height / 2 (in percent).
        assert region.center_x == pytest.approx(20.0)
        assert region.center_y == pytest.approx(25.0)

    def test_right_bottom_edges(self):
        """Test right and bottom edge calculation."""
        region = OCRRegion(text="test", confidence=0.9, x=10.0, y=20.0, width=30.0, height=15.0)
        # Expected edges: x + width and y + height (in percent).
        assert region.right == pytest.approx(40.0)
        assert region.bottom == pytest.approx(35.0)

class TestGridCellMMConversion:
    """Test mm coordinate conversion for grid cells.

    Computed millimetre values are compared with ``pytest.approx`` rather
    than ``==`` so the tests are not sensitive to float rounding.
    """

    def test_cell_to_dict_includes_mm(self):
        """Test that to_dict includes mm coordinates."""
        cell = GridCell(row=0, col=0, x=10.0, y=20.0, width=30.0, height=5.0, text="hello")
        result = cell.to_dict()

        for key in ("x_mm", "y_mm", "width_mm", "height_mm"):
            assert key in result

        # 10% of 210mm = 21mm
        assert result["x_mm"] == pytest.approx(21.0)
        # 20% of 297mm = 59.4mm
        assert result["y_mm"] == pytest.approx(59.4)

    def test_cell_mm_coordinates(self):
        """Test direct mm property access."""
        cell = GridCell(row=0, col=0, x=50.0, y=50.0, width=20.0, height=3.0)

        assert cell.x_mm == pytest.approx(105.0)  # 50% of 210mm
        assert cell.y_mm == pytest.approx(148.5)  # 50% of 297mm
        assert cell.width_mm == pytest.approx(42.0)  # 20% of 210mm
        assert cell.height_mm == pytest.approx(8.91, abs=0.01)  # 3% of 297mm

    def test_cell_to_dict_includes_all_fields(self):
        """Test that to_dict includes all expected fields."""
        cell = GridCell(
            row=1,
            col=2,
            x=10.0,
            y=20.0,
            width=30.0,
            height=5.0,
            text="test",
            confidence=0.95,
            status=CellStatus.RECOGNIZED,
            column_type=ColumnType.ENGLISH,
            logical_row=0,
            logical_col=0,
            is_continuation=False
        )
        result = cell.to_dict()

        assert result["row"] == 1
        assert result["col"] == 2
        assert result["text"] == "test"
        assert result["confidence"] == 0.95
        # Enum members are expected to be serialised as their string values.
        assert result["status"] == "recognized"
        assert result["column_type"] == "english"
        assert result["logical_row"] == 0
        assert result["logical_col"] == 0
        assert result["is_continuation"] is False


class TestA4Constants:
    """Test A4 dimension constants."""

    def test_a4_width_mm(self):
        """Verify A4 width is 210mm."""
        assert A4_WIDTH_MM == 210.0

    def test_a4_height_mm(self):
        """Verify A4 height is 297mm."""
        assert A4_HEIGHT_MM == 297.0

    def test_column_margin_mm(self):
        """Verify column margin is 1mm."""
        assert COLUMN_MARGIN_MM == 1.0

    def test_column_margin_percent(self):
        """Verify column margin percentage calculation."""
        # 1mm expressed as a percentage of the 210mm page width.
        expected = (1.0 / 210.0) * 100
        assert COLUMN_MARGIN_PCT == pytest.approx(expected, abs=0.001)


class TestGridDetectionServiceInit:
    """Test GridDetectionService initialization."""

    def test_init_with_defaults(self):
        """Test service initializes with default parameters."""
        service = GridDetectionService()

        assert service.y_tolerance_pct == 1.5
        assert service.padding_pct == 0.3
        assert service.column_margin_mm == COLUMN_MARGIN_MM

    def test_init_with_custom_params(self):
        """Test service initializes with custom parameters."""
        service = GridDetectionService(
            y_tolerance_pct=2.0,
            padding_pct=0.5,
            column_margin_mm=2.0
        )

        assert service.y_tolerance_pct == 2.0
        assert service.padding_pct == 0.5
        assert service.column_margin_mm == 2.0


class TestDeskewCalculation:
    """Test deskew angle calculation."""

    @staticmethod
    def _stacked_regions(xs):
        """Build five regions labelled a-e at y = 10, 20, ..., 50 percent.

        ``xs`` supplies the x position (percent) of each region, top to
        bottom. Not collected by pytest because the name lacks the
        ``test_`` prefix.
        """
        ys = (10.0, 20.0, 30.0, 40.0, 50.0)
        return [
            OCRRegion(text=label, confidence=0.9, x=x, y=y, width=5.0, height=2.0)
            for label, x, y in zip("abcde", xs, ys)
        ]

    def test_calculate_deskew_no_regions(self):
        """Test deskew returns 0 for empty regions."""
        service = GridDetectionService()
        angle = service.calculate_deskew_angle([])
        assert angle == 0.0

    def test_calculate_deskew_few_regions(self):
        """Test deskew returns 0 for too few regions."""
        service = GridDetectionService()
        regions = [
            OCRRegion(text="a", confidence=0.9, x=10.0, y=10.0, width=5.0, height=2.0),
        ]
        angle = service.calculate_deskew_angle(regions)
        assert angle == 0.0

    def test_calculate_deskew_perfectly_aligned(self):
        """Test deskew returns near-zero for perfectly aligned text."""
        service = GridDetectionService()
        # Perfectly vertical alignment at x=10%
        regions = self._stacked_regions([10.0, 10.0, 10.0, 10.0, 10.0])
        angle = service.calculate_deskew_angle(regions)
        assert abs(angle) < 0.5  # Should be very close to 0

    def test_calculate_deskew_tilted_right(self):
        """Test deskew detects right tilt."""
        service = GridDetectionService()
        # Text tilts right as we go down (x increases with y)
        regions = self._stacked_regions([10.0, 11.0, 12.0, 13.0, 14.0])
        angle = service.calculate_deskew_angle(regions)
        assert angle > 0  # Positive angle for right tilt

    def test_calculate_deskew_max_angle(self):
        """Test deskew is clamped to max 5 degrees."""
        service = GridDetectionService()
        # Extreme tilt
        regions = self._stacked_regions([5.0, 15.0, 25.0, 35.0, 45.0])
        angle = service.calculate_deskew_angle(regions)
        assert abs(angle) <= 5.0  # Clamped to +/-5 degrees


class TestDeskewApplication:
    """Test deskew coordinate transformation."""

    def test_apply_deskew_zero_angle(self):
        """Test no transformation for zero angle."""
        service = GridDetectionService()
        regions = [
            OCRRegion(text="a", confidence=0.9, x=10.0, y=20.0, width=5.0, height=2.0),
        ]
        result = service.apply_deskew_to_regions(regions, 0.0)

        assert len(result) == 1
        # Coordinates must be unchanged; approx tolerates float noise in
        # case the service still runs the transform for a zero angle.
        assert result[0].x == pytest.approx(10.0)
        assert result[0].y == pytest.approx(20.0)

    def test_apply_deskew_preserves_text(self):
        """Test deskew preserves text and confidence."""
        service = GridDetectionService()
        regions = [
            OCRRegion(text="hello", confidence=0.95, x=10.0, y=20.0, width=5.0, height=2.0),
        ]
        result = service.apply_deskew_to_regions(regions, 2.0)

        assert result[0].text == "hello"
        assert result[0].confidence == 0.95


class TestCellStatus:
    """Test cell status classification."""

    def test_cell_status_empty(self):
        """Test empty cell status."""
        # No explicit status is passed; the cell is expected to report EMPTY.
        cell = GridCell(row=0, col=0, x=0, y=0, width=10, height=5, text="")
        assert cell.status == CellStatus.EMPTY

    def test_cell_status_recognized(self):
        """Test recognized cell status."""
        cell = GridCell(
            row=0, col=0, x=0, y=0, width=10, height=5,
            text="hello",
            confidence=0.9,
            status=CellStatus.RECOGNIZED
        )
        assert cell.status == CellStatus.RECOGNIZED

    def test_cell_status_problematic(self):
        """Test problematic cell (low confidence)."""
        cell = GridCell(
            row=0, col=0, x=0, y=0, width=10, height=5,
            text="hello",
            confidence=0.3,
            status=CellStatus.PROBLEMATIC
        )
        assert cell.status == CellStatus.PROBLEMATIC


class TestColumnType:
    """Test column type enum."""

    def test_column_type_values(self):
        """Test column type enum values."""
        assert ColumnType.ENGLISH.value == "english"
        assert ColumnType.GERMAN.value == "german"
        assert ColumnType.EXAMPLE.value == "example"
        assert ColumnType.UNKNOWN.value == "unknown"


class TestDetectGrid:
    """Test grid detection functionality."""

    def test_detect_grid_empty_regions(self):
        """Test grid detection with empty regions."""
        service = GridDetectionService()
        result = service.detect_grid([])

        assert result.rows == 0
        assert result.columns == 0
        assert len(result.cells) == 0

    def test_detect_grid_single_word(self):
        """Test grid detection with single word."""
        service = GridDetectionService()
        regions = [
            OCRRegion(text="house", confidence=0.9, x=10.0, y=10.0, width=10.0, height=2.0),
        ]
        result = service.detect_grid(regions)

        assert result.rows >= 1
        assert result.columns >= 1

    def test_detect_grid_result_has_page_dimensions(self):
        """Test that result includes page dimensions."""
        service = GridDetectionService()
        regions = [
            OCRRegion(text="house", confidence=0.9, x=10.0, y=10.0, width=10.0, height=2.0),
        ]
        result_dict = service.detect_grid(regions).to_dict()

        assert "page_dimensions" in result_dict
        page = result_dict["page_dimensions"]
        assert page["width_mm"] == 210.0
        assert page["height_mm"] == 297.0
        assert page["format"] == "A4"

    def test_detect_grid_result_has_stats(self):
        """Test that result includes stats."""
        service = GridDetectionService()
        regions = [
            OCRRegion(text="house", confidence=0.9, x=10.0, y=10.0, width=10.0, height=2.0),
            OCRRegion(text="Haus", confidence=0.8, x=50.0, y=10.0, width=8.0, height=2.0),
        ]
        result_dict = service.detect_grid(regions).to_dict()

        assert "stats" in result_dict
        assert "recognized" in result_dict["stats"]
        assert "coverage" in result_dict["stats"]


class TestIntegration:
    """Integration tests for full analysis pipeline."""

    def test_full_vocabulary_table_analysis(self):
        """Test analysis of a typical vocabulary table."""
        service = GridDetectionService()

        # Simulate a vocabulary table with 3 columns
        regions = [
            # Row 1
            OCRRegion(text="house", confidence=0.95, x=10.0, y=15.0, width=12.0, height=2.5),
            OCRRegion(text="Haus", confidence=0.92, x=45.0, y=15.0, width=8.0, height=2.5),
            OCRRegion(text="This is a house.", confidence=0.88, x=70.0, y=15.0, width=25.0, height=2.5),
            # Row 2
            OCRRegion(text="car", confidence=0.94, x=10.0, y=22.0, width=8.0, height=2.5),
            OCRRegion(text="Auto", confidence=0.91, x=45.0, y=22.0, width=9.0, height=2.5),
            OCRRegion(text="I drive a car.", confidence=0.85, x=70.0, y=22.0, width=22.0, height=2.5),
            # Row 3
            OCRRegion(text="tree", confidence=0.96, x=10.0, y=29.0, width=9.0, height=2.5),
            OCRRegion(text="Baum", confidence=0.93, x=45.0, y=29.0, width=10.0, height=2.5),
            OCRRegion(text="The tree is tall.", confidence=0.87, x=70.0, y=29.0, width=24.0, height=2.5),
        ]

        result_dict = service.detect_grid(regions).to_dict()

        # Verify structure
        for key in ("cells", "page_dimensions", "stats"):
            assert key in result_dict

        # Verify page dimensions
        assert result_dict["page_dimensions"]["format"] == "A4"

        # Verify cells have mm coordinates. The check is skipped when the
        # service returns no cells, matching the original test's leniency.
        cells = result_dict["cells"]
        if cells and cells[0]:
            first_cell = cells[0][0]
            for key in ("x_mm", "y_mm", "width_mm", "height_mm"):
                assert key in first_cell


if __name__ == "__main__":
    # Propagate pytest's exit code so a failing run is visible to the shell.
    raise SystemExit(pytest.main([__file__, "-v"]))