# breakpilot-lehrer/klausur-service/backend/tests/test_cv_vocab_pipeline.py

"""
Unit Tests for CV Vocab Pipeline (cv_vocab_pipeline.py)

Tests cover:
- Data classes (PageRegion, VocabRow, PipelineResult)
- Stage 2: Deskew image
- Stage 3: Dewarp (pass-through)
- Stage 4: Image preparation (OCR + Layout images)
- Stage 5: Layout analysis (content bounds, projection profiles, column detection)
- Stage 6: Multi-pass OCR region handling
- Stage 7: Line grouping and vocabulary matching
- Orchestrator (run_cv_pipeline)

DSGVO Note: All tests run locally with synthetic data. No external API calls.
"""

import pytest
import numpy as np
from unittest.mock import AsyncMock, MagicMock, patch, PropertyMock
from dataclasses import asdict

# Import module under test
from cv_vocab_pipeline import (
    PageRegion,
    VocabRow,
    PipelineResult,
    deskew_image,
    dewarp_image,
    create_ocr_image,
    create_layout_image,
    _find_content_bounds,
    analyze_layout,
    _group_words_into_lines,
    match_lines_to_vocab,
    run_cv_pipeline,
    CV2_AVAILABLE,
    TESSERACT_AVAILABLE,
    CV_PIPELINE_AVAILABLE,
)


# =============================================
# FIXTURES
# =============================================

@pytest.fixture
def white_image():
    """Return an all-white uint8 image: 200 rows x 300 columns, 3 channels."""
    return np.full((200, 300, 3), 255, dtype=np.uint8)


@pytest.fixture
def text_like_image():
    """Return a white 400x600 (rows x cols) 3-channel image with three columns of dark bars.

    Each column gets ten 15px-high bars (value 30) starting at y=50 and
    repeating every 30px. The white gaps between the bar groups are what
    makes the picture look like a 3-column (EN / DE / example) page.
    """
    canvas = np.full((400, 600, 3), 255, dtype=np.uint8)

    # Horizontal pixel span (start, stop) of the dark bars in each column.
    bar_spans = (
        (30, 160),   # column 1 (EN)
        (220, 360),  # column 2 (DE)
        (420, 570),  # column 3 (example)
    )

    for x_start, x_stop in bar_spans:
        for bar_top in range(50, 350, 30):
            canvas[bar_top:bar_top + 15, x_start:x_stop, :] = 30

    return canvas


@pytest.fixture
def binary_image():
    """Return a single-channel 400x600 image holding only 0 and 255.

    The background is white (255); black (0) bands of 15px height span
    x=30..570, starting at y=50 and repeating every 30px.
    """
    page = np.full((400, 600), 255, dtype=np.uint8)
    for band_top in range(50, 350, 30):
        page[band_top:band_top + 15, 30:570] = 0
    return page


@pytest.fixture
def sample_words_column_en():
    """Return three OCR word dicts for the English column, one per line."""
    # (text, top, conf) — every word shares left=30, width=80, height=15.
    specs = (
        ('achieve', 50, 90),
        ('improve', 80, 85),
        ('success', 110, 92),
    )
    return [
        {
            'text': text,
            'left': 30,
            'top': top,
            'width': 80,
            'height': 15,
            'conf': conf,
            'region_type': 'column_en',
        }
        for text, top, conf in specs
    ]


@pytest.fixture
def sample_words_column_de():
    """Return three OCR word dicts for the German column, one per line.

    The ``top`` values sit 2px below the matching English fixture words.
    """
    # (text, top, conf) — every word shares left=220, width=100, height=15.
    specs = (
        ('erreichen', 52, 88),
        ('verbessern', 82, 80),
        ('Erfolg', 112, 95),
    )
    return [
        {
            'text': text,
            'left': 220,
            'top': top,
            'width': 100,
            'height': 15,
            'conf': conf,
            'region_type': 'column_de',
        }
        for text, top, conf in specs
    ]


@pytest.fixture
def sample_words_column_ex():
    """Return four OCR word dicts for the example-sentence column."""
    # (text, left, top, width, conf) — every word has height=15.
    specs = (
        ('She', 420, 50, 30, 85),
        ('achieved', 455, 50, 70, 80),
        ('her', 530, 50, 30, 90),
        ('goals.', 420, 52, 50, 75),
    )
    return [
        {
            'text': text,
            'left': left,
            'top': top,
            'width': width,
            'height': 15,
            'conf': conf,
            'region_type': 'column_example',
        }
        for text, left, top, width, conf in specs
    ]


@pytest.fixture
def sample_regions():
    """Return a 3-column layout (EN, DE, example) as ``PageRegion`` objects.

    All regions start at y=50 and are 300px high; only type, x and width vary.
    """
    # (region type, x, width)
    column_specs = (
        ('column_en', 0, 190),
        ('column_de', 210, 160),
        ('column_example', 410, 190),
    )
    return [
        PageRegion(type=kind, x=x_start, y=50, width=col_width, height=300)
        for kind, x_start, col_width in column_specs
    ]


# =============================================
# DATA CLASS TESTS
# =============================================

class TestDataClasses:
    """Check construction and default values of the pipeline's data classes."""

    def test_page_region_creation(self):
        """Every constructor argument of PageRegion is readable as an attribute."""
        region = PageRegion(type='column_en', x=10, y=20, width=100, height=200)
        assert region.type == 'column_en'
        assert region.x == 10
        assert region.y == 20
        assert region.width == 100
        assert region.height == 200

    def test_vocab_row_defaults(self):
        """A VocabRow built without arguments has empty text and zeroed numbers."""
        row = VocabRow()
        assert row.english == ""
        assert row.german == ""
        assert row.example == ""
        assert row.confidence == 0.0
        assert row.y_position == 0

    def test_vocab_row_with_values(self):
        """All five explicitly passed VocabRow fields are stored unchanged."""
        row = VocabRow(english="test", german="Test", example="A test.", confidence=85.5, y_position=100)
        assert row.english == "test"
        assert row.german == "Test"
        # example and y_position are passed above, so verify them as well
        # instead of only covering three of the five fields.
        assert row.example == "A test."
        assert row.confidence == 85.5
        assert row.y_position == 100

    def test_pipeline_result_defaults(self):
        """A fresh PipelineResult is empty and carries no error."""
        result = PipelineResult()
        assert result.vocabulary == []
        assert result.word_count == 0
        assert result.columns_detected == 0
        assert result.duration_seconds == 0.0
        assert result.stages == {}
        assert result.error is None

    def test_pipeline_result_error(self):
        """An error message passed to PipelineResult is stored verbatim."""
        result = PipelineResult(error="Something went wrong")
        assert result.error == "Something went wrong"


# =============================================
# STAGE 2: DESKEW TESTS
# =============================================

@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestDeskew:
    """Stage 2: checks for ``deskew_image`` on unrotated synthetic input."""

    def test_deskew_straight_image(self, white_image):
        """An unrotated blank image gives a near-zero angle and keeps its shape."""
        deskewed, detected_angle = deskew_image(white_image)
        assert abs(detected_angle) < 0.1
        assert deskewed.shape == white_image.shape

    def test_deskew_returns_tuple(self, white_image):
        """The return value is a 2-tuple of (ndarray image, float angle)."""
        outcome = deskew_image(white_image)
        assert isinstance(outcome, tuple)
        assert len(outcome) == 2
        image_part, angle_part = outcome
        assert isinstance(image_part, np.ndarray)
        assert isinstance(angle_part, float)

    def test_deskew_preserves_shape(self, text_like_image):
        """The corrected image has the same shape as the input image."""
        deskewed = deskew_image(text_like_image)[0]
        assert deskewed.shape == text_like_image.shape


# =============================================
# STAGE 3: DEWARP TESTS
# =============================================

@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestDewarp:
    """Stage 3: ``dewarp_image`` is expected to hand the image back unchanged."""

    def test_dewarp_passthrough(self, white_image):
        """The output pixels equal the input pixels (pass-through behaviour)."""
        dewarped = dewarp_image(white_image)
        np.testing.assert_array_equal(dewarped, white_image)

    def test_dewarp_preserves_shape(self, text_like_image):
        """The output shape equals the input shape."""
        dewarped = dewarp_image(text_like_image)
        assert dewarped.shape == text_like_image.shape


# =============================================
# STAGE 4: IMAGE PREPARATION TESTS
# =============================================

@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestImagePreparation:
    """Stage 4: checks for ``create_ocr_image`` and ``create_layout_image``."""

    def test_create_ocr_image_returns_grayscale(self, text_like_image):
        """The OCR image is a 2-D (single-channel) uint8 array."""
        prepared = create_ocr_image(text_like_image)
        assert prepared.ndim == 2  # no channel axis
        assert prepared.dtype == np.uint8

    def test_create_ocr_image_is_binary(self, text_like_image):
        """The OCR image holds no pixel values other than 0 and 255."""
        prepared = create_ocr_image(text_like_image)
        present_values = set(np.unique(prepared).tolist())
        assert present_values <= {0, 255}

    def test_create_layout_image_returns_grayscale(self, text_like_image):
        """The layout image is a 2-D (single-channel) uint8 array."""
        prepared = create_layout_image(text_like_image)
        assert prepared.ndim == 2
        assert prepared.dtype == np.uint8

    def test_create_layout_image_enhanced_contrast(self, text_like_image):
        """The layout image has the same shape as a plain grayscale conversion.

        NOTE(review): despite the test name, only the shape is compared here;
        no assertion checks that the contrast/histogram actually changed.
        """
        import cv2
        reference_gray = cv2.cvtColor(text_like_image, cv2.COLOR_BGR2GRAY)
        prepared = create_layout_image(text_like_image)
        assert prepared.shape == reference_gray.shape


# =============================================
# STAGE 5: LAYOUT ANALYSIS TESTS
# =============================================

@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestContentBounds:
    """Stage 5 helper: checks for ``_find_content_bounds`` on inverted images."""

    def test_empty_image(self):
        """With no content pixels at all, the bounds stay inside the image."""
        # An all-zero inverted image corresponds to a completely white page.
        blank_inverted = np.zeros((200, 300), dtype=np.uint8)
        bounds = _find_content_bounds(blank_inverted)
        left, right, top, bottom = bounds
        assert left >= 0
        assert right <= 300
        assert top >= 0
        assert bottom <= 200

    def test_centered_content(self):
        """A single content block is enclosed by the bounds, with 2px slack."""
        inverted = np.zeros((400, 600), dtype=np.uint8)
        # Content occupies rows 100..299 and columns 50..549.
        inverted[100:300, 50:550] = 255
        left, right, top, bottom = _find_content_bounds(inverted)
        assert left <= 52    # block starts at x=50
        assert right >= 548  # block ends at x=550
        assert top <= 102    # block starts at y=100
        assert bottom >= 298  # block ends at y=300


@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestLayoutAnalysis:
    """Stage 5: checks for column detection via ``analyze_layout``."""

    @staticmethod
    def _analyze(image):
        """Prepare the OCR and layout images for *image* and run the layout analysis."""
        ocr_img = create_ocr_image(image)
        layout_img = create_layout_image(image)
        return analyze_layout(layout_img, ocr_img)

    @staticmethod
    def _columns_only(regions):
        """Keep only the regions whose type starts with ``'column'``."""
        return [region for region in regions if region.type.startswith('column')]

    def test_returns_list_of_regions(self, text_like_image):
        """The result is a list whose items are all PageRegion instances."""
        found = self._analyze(text_like_image)
        assert isinstance(found, list)
        assert all(isinstance(item, PageRegion) for item in found)

    def test_detects_columns(self, text_like_image):
        """A clear 3-column image produces at least one column region."""
        found = self._analyze(text_like_image)
        assert len(self._columns_only(found)) >= 1

    def test_single_column_fallback(self):
        """Full-width text without column gaps still produces at least one column."""
        page = np.ones((400, 600, 3), dtype=np.uint8) * 255
        # Dark bars spanning almost the whole width, so no column gap exists.
        for bar_top in range(50, 350, 20):
            page[bar_top:bar_top + 10, 20:580, :] = 30
        found = self._analyze(page)
        # At minimum one column (the full-page fallback) is expected.
        assert len(self._columns_only(found)) >= 1

    def test_region_types_are_valid(self, text_like_image):
        """Every returned region type belongs to the known set of types."""
        found = self._analyze(text_like_image)
        valid_types = {'column_en', 'column_de', 'column_example', 'header', 'footer'}
        for r in found:
            assert r.type in valid_types, f"Unexpected region type: {r.type}"


# =============================================
# STAGE 7: LINE GROUPING TESTS
# =============================================

class TestLineGrouping:
    """Stage 7: checks for ``_group_words_into_lines``."""

    @staticmethod
    def _word(text, left, top, conf):
        """Build an OCR word dict with the fixed 50x15 box used by these tests."""
        return {'text': text, 'left': left, 'top': top, 'width': 50, 'height': 15, 'conf': conf}

    def test_empty_input(self):
        """No words in, no lines out."""
        grouped = _group_words_into_lines([])
        assert grouped == []

    def test_single_word(self):
        """One word yields exactly one line that contains just that word."""
        grouped = _group_words_into_lines([self._word('hello', 10, 50, 90)])
        assert len(grouped) == 1
        only_line = grouped[0]
        assert len(only_line) == 1
        assert only_line[0]['text'] == 'hello'

    def test_words_on_same_line(self):
        """Two words 2px apart in Y share one line at a 10px tolerance."""
        pair = [
            self._word('hello', 10, 50, 90),
            self._word('world', 70, 52, 85),
        ]
        grouped = _group_words_into_lines(pair, y_tolerance_px=10)
        assert len(grouped) == 1
        assert len(grouped[0]) == 2

    def test_words_on_different_lines(self):
        """Words 50px apart in Y land on separate lines at a 20px tolerance."""
        stacked = [
            self._word(label, 10, top, conf)
            for label, top, conf in (('line1', 50, 90), ('line2', 100, 85), ('line3', 150, 88))
        ]
        grouped = _group_words_into_lines(stacked, y_tolerance_px=20)
        assert len(grouped) == 3

    def test_words_sorted_by_x_within_line(self):
        """Within a line, words come back ordered by their X position."""
        # Deliberately supplied right-to-left.
        reversed_pair = [
            self._word('world', 100, 50, 85),
            self._word('hello', 10, 52, 90),
        ]
        grouped = _group_words_into_lines(reversed_pair, y_tolerance_px=10)
        assert len(grouped) == 1
        line = grouped[0]
        assert line[0]['text'] == 'hello'
        assert line[1]['text'] == 'world'


# =============================================
# STAGE 7: VOCABULARY MATCHING TESTS
# =============================================

class TestVocabMatching:
    """Stage 7: checks for ``match_lines_to_vocab``."""

    def test_empty_results(self, sample_regions):
        """No OCR results give an empty vocabulary list."""
        rows = match_lines_to_vocab({}, sample_regions)
        assert rows == []

    def test_en_only(self, sample_words_column_en, sample_regions):
        """English-only input yields one row per word with an empty German side."""
        rows = match_lines_to_vocab({'column_en': sample_words_column_en}, sample_regions)
        assert len(rows) == 3
        for entry in rows:
            assert entry.english != ""
            assert entry.german == ""

    def test_en_de_matching(self, sample_words_column_en, sample_words_column_de, sample_regions):
        """English and German words at (nearly) the same Y end up in one row."""
        by_column = {}
        by_column['column_en'] = sample_words_column_en
        by_column['column_de'] = sample_words_column_de
        rows = match_lines_to_vocab(by_column, sample_regions, y_tolerance_px=25)
        assert len(rows) == 3
        # The topmost pair is achieve <-> erreichen.
        top_row = rows[0]
        assert top_row.english == 'achieve'
        assert top_row.german == 'erreichen'

    def test_full_3_column_matching(self, sample_words_column_en, sample_words_column_de,
                                      sample_words_column_ex, sample_regions):
        """With all three columns present, the first row also carries example text."""
        by_column = {}
        by_column['column_en'] = sample_words_column_en
        by_column['column_de'] = sample_words_column_de
        by_column['column_example'] = sample_words_column_ex
        rows = match_lines_to_vocab(by_column, sample_regions, y_tolerance_px=25)
        assert len(rows) >= 1
        top_row = rows[0]
        assert top_row.english == 'achieve'
        assert top_row.example != ""

    def test_sorted_by_y_position(self, sample_words_column_en, sample_regions):
        """Rows come back in ascending ``y_position`` order."""
        rows = match_lines_to_vocab({'column_en': sample_words_column_en}, sample_regions)
        y_values = [entry.y_position for entry in rows]
        assert y_values == sorted(y_values)

    def test_skips_short_entries(self, sample_regions):
        """A one-character entry is dropped; the longer entry survives."""
        too_short = {'text': 'a', 'left': 30, 'top': 50, 'width': 10, 'height': 15,
                     'conf': 90, 'region_type': 'column_en'}
        long_enough = {'text': 'valid', 'left': 30, 'top': 80, 'width': 50, 'height': 15,
                       'conf': 90, 'region_type': 'column_en'}
        rows = match_lines_to_vocab({'column_en': [too_short, long_enough]}, sample_regions)
        assert len(rows) == 1
        assert rows[0].english == 'valid'

    def test_confidence_calculation(self, sample_words_column_en, sample_words_column_de, sample_regions):
        """Row confidence is about the mean of the matched column confidences."""
        by_column = {}
        by_column['column_en'] = sample_words_column_en
        by_column['column_de'] = sample_words_column_de
        rows = match_lines_to_vocab(by_column, sample_regions, y_tolerance_px=25)
        # Topmost row: EN conf=90 and DE conf=88, so the mean is 89.
        top_confidence = rows[0].confidence
        assert top_confidence > 0
        assert top_confidence == pytest.approx(89.0, abs=1.0)


# =============================================
# ORCHESTRATOR TESTS
# =============================================

class TestOrchestrator:
    """Test the ``run_cv_pipeline`` orchestrator.

    Tesseract is replaced by a mock in the image-based tests, so no real OCR
    runs; those tests only verify metadata and stage bookkeeping.
    """

    @staticmethod
    def _empty_tesseract_data():
        """Return a Tesseract-style data dict that contains no words."""
        return {
            'text': [], 'conf': [], 'left': [], 'top': [],
            'width': [], 'height': [],
        }

    @pytest.mark.asyncio
    async def test_no_input_returns_error(self):
        """Pipeline without input should return error."""
        result = await run_cv_pipeline()
        assert result.error is not None
        assert "No input data" in result.error

    @pytest.mark.asyncio
    async def test_pipeline_unavailable(self):
        """When CV_PIPELINE_AVAILABLE is False, should return error."""
        with patch('cv_vocab_pipeline.CV_PIPELINE_AVAILABLE', False):
            result = await run_cv_pipeline(pdf_data=b"fake")
            assert result.error is not None
            assert "not available" in result.error

    # The image-based tests assert ``result.error is None``. As
    # test_pipeline_unavailable shows, the pipeline reports an error while
    # CV_PIPELINE_AVAILABLE is False, so they must be skipped (not failed) in
    # that case. OpenCV is still needed to encode the synthetic input image.
    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not (CV2_AVAILABLE and CV_PIPELINE_AVAILABLE),
        reason="CV pipeline not available",
    )
    async def test_pipeline_with_image_data(self):
        """Pipeline with a real synthetic image should run without errors."""
        import cv2
        # White 300x200 image with dark, text-like horizontal bars.
        img = np.ones((200, 300, 3), dtype=np.uint8) * 255
        for y in range(30, 170, 25):
            img[y:y+12, 20:280, :] = 30
        _, img_bytes = cv2.imencode('.png', img)
        image_data = img_bytes.tobytes()

        with patch('cv_vocab_pipeline.pytesseract') as mock_tess:
            # Mock Tesseract so that it reports no recognised words.
            mock_tess.image_to_data.return_value = self._empty_tesseract_data()
            mock_tess.Output.DICT = 'dict'

            result = await run_cv_pipeline(image_data=image_data)
            assert result.error is None
            assert result.image_width == 300
            assert result.image_height == 200
            assert 'render' in result.stages
            assert 'deskew' in result.stages

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not (CV2_AVAILABLE and CV_PIPELINE_AVAILABLE),
        reason="CV pipeline not available",
    )
    async def test_pipeline_records_timing(self):
        """Total duration and every recorded stage value must be non-negative."""
        import cv2
        img = np.ones((100, 150, 3), dtype=np.uint8) * 255
        _, img_bytes = cv2.imencode('.png', img)

        with patch('cv_vocab_pipeline.pytesseract') as mock_tess:
            mock_tess.image_to_data.return_value = self._empty_tesseract_data()
            mock_tess.Output.DICT = 'dict'

            result = await run_cv_pipeline(image_data=img_bytes.tobytes())
            assert result.duration_seconds >= 0
            assert all(v >= 0 for v in result.stages.values())

    def test_pipeline_result_format(self):
        """PipelineResult vocabulary should be list of dicts with expected keys.

        This test awaits nothing, so it is a plain synchronous test and does
        not depend on the asyncio pytest plugin.
        """
        result = PipelineResult()
        result.vocabulary = [
            {"english": "test", "german": "Test", "example": "A test.", "confidence": 90.0}
        ]
        assert len(result.vocabulary) == 1
        entry = result.vocabulary[0]
        for key in ("english", "german", "example", "confidence"):
            assert key in entry


# =============================================
# INTEGRATION-STYLE TESTS (with mocked Tesseract)
# =============================================

@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestStageIntegration:
    """Chain several pipeline stages together; no OCR engine is invoked here."""

    def test_image_prep_to_layout(self, text_like_image):
        """Stages 4→5: prepared images keep the page size and feed the layout analysis."""
        page_size = text_like_image.shape[:2]
        ocr_ready = create_ocr_image(text_like_image)
        layout_ready = create_layout_image(text_like_image)

        assert ocr_ready.shape[:2] == page_size
        assert layout_ready.shape[:2] == page_size

        detected = analyze_layout(layout_ready, ocr_ready)
        assert len(detected) >= 1

    def test_deskew_to_image_prep(self, text_like_image):
        """Stages 2→4: the deskewed image can be turned into OCR and layout images."""
        straightened, _angle = deskew_image(text_like_image)
        ocr_ready = create_ocr_image(straightened)
        layout_ready = create_layout_image(straightened)
        expected_size = straightened.shape[:2]
        assert ocr_ready.shape[:2] == expected_size
        assert layout_ready.shape[:2] == expected_size


# =============================================
# RUN TESTS
# =============================================

if __name__ == "__main__":
    # pytest.main() returns the session's exit code. Raising SystemExit with it
    # makes a direct run of this file exit non-zero when any test fails,
    # instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))