breakpilot-lehrer/klausur-service/backend/tests/test_grid_editor_api.py

"""
Tests for grid_editor_api zone merging, heading detection, and ghost filtering.

Covers:
- _merge_content_zones_across_boxes: zone merging logic
- _detect_heading_rows_by_color: heading detection by color + height
- _filter_border_ghosts: single-char ghost detection
- _detect_header_rows: skip_first_row_header flag
"""

import sys
sys.path.insert(0, '/app')

import pytest
from cv_vocab_types import PageZone, DetectedBox
from grid_editor_api import (
    _merge_content_zones_across_boxes,
    _filter_border_ghosts,
    _detect_header_rows,
    _detect_heading_rows_by_color,
)
from cv_ocr_engines import _text_has_garbled_ipa, fix_ipa_continuation_cell


# ---------------------------------------------------------------------------
# _merge_content_zones_across_boxes
# ---------------------------------------------------------------------------

class TestMergeContentZonesAcrossBoxes:
    """Exercise _merge_content_zones_across_boxes on several zone layouts."""

    @staticmethod
    def _content(index, y, height):
        """Return a content zone with x=0 and width=500 at the given vertical extent."""
        return PageZone(index=index, zone_type="content", y=y, height=height,
                        x=0, width=500)

    @staticmethod
    def _box(index, y, height, confidence=0.9, x=50, width=400):
        """Return a box zone whose DetectedBox has the same geometry as the zone."""
        detected = DetectedBox(x=x, y=y, width=width, height=height,
                               confidence=confidence)
        return PageZone(index=index, zone_type="box", y=y, height=height,
                        x=x, width=width, box=detected)

    @staticmethod
    def _merge(zones):
        """Call the merge with the trailing arguments (0, 500) used by every test here."""
        return _merge_content_zones_across_boxes(zones, 0, 500)

    def test_no_merge_when_less_than_3_zones(self):
        """Two zones only: both come back, in order, with their types intact."""
        merged_zones = self._merge([
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
        ])
        assert [z.zone_type for z in merged_zones] == ["content", "box"]

    def test_merge_content_box_content(self):
        """[content, box, content] collapses into one content zone carrying an overlay."""
        merged_zones = self._merge([
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
            self._content(2, y=150, height=200),
        ])
        assert len(merged_zones) == 1
        combined = merged_zones[0]
        assert combined.zone_type == "content"
        assert combined.y == 0
        assert combined.height == 350  # spans y=0 through y=350
        assert len(combined.image_overlays) == 1
        overlay = combined.image_overlays[0]
        assert overlay["y"] == 100
        assert overlay["height"] == 50

    def test_box_at_start_not_merged(self):
        """A leading box is kept as-is; the following content+box+content merges."""
        merged_zones = self._merge([
            self._box(0, y=0, height=50),
            self._content(1, y=50, height=100),
            self._box(2, y=150, height=50),
            self._content(3, y=200, height=200),
        ])
        # Leading box survives, the remaining three zones become one content zone.
        assert [z.zone_type for z in merged_zones] == ["box", "content"]
        assert len(merged_zones[1].image_overlays) == 1

    def test_consecutive_boxes_not_merged(self):
        """[content, box, box, content] is returned as four zones (no merge)."""
        merged_zones = self._merge([
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
            self._box(2, y=150, height=30, confidence=0.8, x=60, width=380),
            self._content(3, y=180, height=200),
        ])
        # Only a single box between two content zones is merged; two boxes
        # back to back break that pattern.
        assert len(merged_zones) == 4

    def test_zone_reindexing(self):
        """After a merge the first resulting zone has index 0."""
        merged_zones = self._merge([
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
            self._content(2, y=150, height=200),
        ])
        assert merged_zones[0].index == 0

    def test_no_boxes_passthrough(self):
        """Two content zones without any box yield two zones."""
        merged_zones = self._merge([
            self._content(0, y=0, height=100),
            self._content(1, y=100, height=100),
        ])
        assert len(merged_zones) == 2

    def test_typical_vocab_page_pattern(self):
        """[box(VOCABULARY), content, box(image), content]: the header box stays,
        the remaining content+box+content merges into one content zone."""
        merged_zones = self._merge([
            self._box(0, y=10, height=40, confidence=0.95),
            self._content(1, y=60, height=50),
            self._box(2, y=120, height=85, confidence=0.8),
            self._content(3, y=210, height=500),
        ])
        # First entry is the VOCABULARY header box, second the merged content.
        assert [z.zone_type for z in merged_zones] == ["box", "content"]
        body = merged_zones[1]
        assert body.y == 60
        assert body.height == 650  # from y=60 to y=710
        assert len(body.image_overlays) == 1
        assert body.image_overlays[0]["y"] == 120
        # Indices are renumbered from zero.
        assert [z.index for z in merged_zones] == [0, 1]


# ---------------------------------------------------------------------------
# _detect_heading_rows_by_color
# ---------------------------------------------------------------------------

class TestDetectHeadingRowsByColor:
    """Exercise _detect_heading_rows_by_color on hand-built zone dictionaries."""

    def _make_word_box(self, text, left, top, width, height, color="black"):
        """Return a word-box dict; confidence is always 90."""
        return dict(
            text=text,
            left=left,
            top=top,
            width=width,
            height=height,
            color_name=color,
            conf=90,
        )

    def _make_zone(self, cells, rows, columns, zone_index=0,
                   bbox_x=0, bbox_y=0, bbox_w=800, bbox_h=1000):
        """Return a content-zone dict wrapping the given cells, rows and columns."""
        bbox = {"x": bbox_x, "y": bbox_y, "w": bbox_w, "h": bbox_h}
        return dict(
            zone_index=zone_index,
            zone_type="content",
            bbox_px=bbox,
            cells=cells,
            rows=rows,
            columns=columns,
        )

    def _make_cell(self, row, col, text, left, top, width, height,
                   color="black", col_type=None):
        """Return a zone-0 cell holding one word box whose text equals the cell text.

        col_type defaults to "column_<col + 1>".
        """
        return {
            "cell_id": f"Z0_R{row:02d}_C{col}",
            "zone_index": 0,
            "row_index": row,
            "col_index": col,
            "col_type": col_type if col_type is not None else f"column_{col + 1}",
            "text": text,
            "word_boxes": [self._make_word_box(text, left, top, width, height, color)],
        }

    def _plain_second_row(self):
        """Return the black 'foo' / 'bar' cells of row 1 shared by several tests."""
        return [
            self._make_cell(1, 0, "foo", 10, 130, 80, 20),
            self._make_cell(1, 1, "bar", 300, 130, 80, 20),
        ]

    @staticmethod
    def _make_row(index, y_min, y_max, is_header=False):
        """Return a row dict with pixel bounds and an is_header flag."""
        return {"index": index, "y_min_px": y_min, "y_max_px": y_max,
                "is_header": is_header}

    @staticmethod
    def _make_columns(count):
        """Return `count` column dicts labelled column_1, column_2, ..."""
        return [{"index": i, "label": f"column_{i + 1}"} for i in range(count)]

    @staticmethod
    def _count_headings(zones_data):
        """Call the detector with the (800, 1000) trailing arguments used by every test here."""
        return _detect_heading_rows_by_color(zones_data, 800, 1000)

    def test_blue_heading_detected(self):
        """A row whose words are all blue and taller than the rest becomes a heading."""
        heading_ri = 2
        cells = []
        for ri in range(5):
            top = 100 + ri * 30
            if ri == heading_ri:
                # Heading row: blue and height 25 (original note: > 1.2 * 20 = 24).
                color, height = "blue", 25
            else:
                # Ordinary rows: black, height 20.
                color, height = "black", 20
            cells.append(
                self._make_cell(ri, 0, f"word_{ri}", 10, top, 80, height, color))
            cells.append(
                self._make_cell(ri, 1, f"translation_{ri}", 300, top, 100, height, color))

        rows = [self._make_row(ri, 100 + ri * 30, 120 + ri * 30) for ri in range(5)]
        zones_data = [self._make_zone(cells, rows, self._make_columns(2))]

        assert self._count_headings(zones_data) == 1
        # The row dict passed in is flagged as a header.
        assert rows[2]["is_header"] is True
        # Exactly one cell remains for that row: a heading containing both texts.
        in_heading_row = [
            cell for cell in zones_data[0]["cells"] if cell["row_index"] == heading_ri
        ]
        assert len(in_heading_row) == 1
        heading = in_heading_row[0]
        assert heading["col_type"] == "heading"
        assert "word_2" in heading["text"]
        assert "translation_2" in heading["text"]

    def test_black_row_not_heading(self):
        """A tall row of black words is not counted as a heading."""
        cells = [
            self._make_cell(0, 0, "hello", 10, 100, 80, 25, "black"),
            self._make_cell(0, 1, "world", 300, 100, 80, 25, "black"),
            *self._plain_second_row(),
        ]
        rows = [self._make_row(0, 100, 125), self._make_row(1, 130, 150)]
        zones_data = [self._make_zone(cells, rows, self._make_columns(2))]
        assert self._count_headings(zones_data) == 0

    def test_mixed_color_row_not_heading(self):
        """A row mixing blue and black words is not counted as a heading."""
        cells = [
            self._make_cell(0, 0, "Unit", 10, 100, 60, 25, "blue"),
            self._make_cell(0, 1, "normal", 300, 100, 80, 25, "black"),
            *self._plain_second_row(),
        ]
        rows = [self._make_row(0, 100, 125), self._make_row(1, 130, 150)]
        zones_data = [self._make_zone(cells, rows, self._make_columns(2))]
        assert self._count_headings(zones_data) == 0

    def test_colored_but_not_tall_not_heading(self):
        """An all-blue row of ordinary height is not counted as a heading."""
        cells = [
            self._make_cell(0, 0, "Unit", 10, 100, 60, 20, "blue"),
            self._make_cell(0, 1, "four", 300, 100, 60, 20, "blue"),
            *self._plain_second_row(),
        ]
        rows = [self._make_row(0, 100, 120), self._make_row(1, 130, 150)]
        zones_data = [self._make_zone(cells, rows, self._make_columns(2))]
        assert self._count_headings(zones_data) == 0

    def test_single_column_zone_skipped(self):
        """A zone with a single column yields no headings."""
        cells = [self._make_cell(0, 0, "Unit", 10, 100, 60, 25, "blue")]
        rows = [self._make_row(0, 100, 125)]
        zones_data = [self._make_zone(cells, rows, self._make_columns(1))]
        assert self._count_headings(zones_data) == 0

    def test_already_header_skipped(self):
        """A row that already has is_header=True is not counted again."""
        cells = [
            self._make_cell(0, 0, "Header", 10, 100, 60, 25, "blue",
                            col_type="spanning_header"),
            *self._plain_second_row(),
        ]
        rows = [
            self._make_row(0, 100, 125, is_header=True),
            self._make_row(1, 130, 150),
        ]
        zones_data = [self._make_zone(cells, rows, self._make_columns(2))]
        assert self._count_headings(zones_data) == 0


# ---------------------------------------------------------------------------
# _filter_border_ghosts (Fix 2: single-char only)
# ---------------------------------------------------------------------------

class TestFilterBorderGhosts:
    """Check which words _filter_border_ghosts drops for bordered and borderless boxes."""

    @staticmethod
    def _word(text, left, top, width, height):
        """Return a word dict with text and pixel geometry."""
        return {"text": text, "left": left, "top": top,
                "width": width, "height": height}

    def test_single_char_ghost_removed(self):
        """A lone '|' on the box border is dropped; the regular word survives."""
        bordered = DetectedBox(x=100, y=100, width=200, height=150,
                               confidence=0.9, border_thickness=3)
        candidates = [
            self._word("|", 98, 200, 5, 20),
            self._word("hello", 150, 150, 80, 20),
        ]
        kept, removed = _filter_border_ghosts(candidates, [bordered])
        assert removed == 1
        assert [w["text"] for w in kept] == ["hello"]

    def test_multi_char_ghost_kept(self):
        """A two-character '(=' next to a bordered box is kept as real content."""
        bordered = DetectedBox(x=648, y=129, width=338, height=125,
                               confidence=0.7, border_thickness=3)
        candidates = [
            self._word("(=", 644, 294, 16, 17),
            self._word("I", 665, 294, 9, 18),
        ]
        kept, removed = _filter_border_ghosts(candidates, [bordered])
        assert removed == 0
        assert len(kept) == 2

    def test_borderless_box_no_ghost_filter(self):
        """With border_thickness=0 nothing is filtered, not even a pipe on the edge."""
        borderless = DetectedBox(x=648, y=129, width=338, height=125,
                                 confidence=0.7, border_thickness=0)
        candidates = [
            self._word("I", 643, 272, 6, 19),   # near box edge
            self._word("|", 647, 200, 3, 10),   # even a pipe on the edge
        ]
        kept, removed = _filter_border_ghosts(candidates, [borderless])
        assert removed == 0  # borderless box: nothing is dropped
        assert len(kept) == 2

    def test_single_paren_on_border_removed(self):
        """A lone ')' on the border is dropped, leaving no words."""
        bordered = DetectedBox(x=100, y=100, width=200, height=150,
                               confidence=0.9, border_thickness=2)
        candidates = [self._word(")", 299, 200, 4, 7)]
        kept, removed = _filter_border_ghosts(candidates, [bordered])
        assert removed == 1
        assert len(kept) == 0


# ---------------------------------------------------------------------------
# _detect_header_rows (Fix 3: skip_first_row_header)
# ---------------------------------------------------------------------------

class TestDetectHeaderRowsSkipFlag:
    """Check how skip_first_row_header changes the result of _detect_header_rows."""

    @staticmethod
    def _rows_and_words():
        """Return three rows, the first separated by a larger gap, plus one word per row.

        Each word is 20 px tall and starts 5 px below its row's y_min.
        """
        spans = [(100, 120), (160, 180), (185, 205)]
        rows = [
            {"y_min": y_min, "y_max": y_max, "index": idx}
            for idx, (y_min, y_max) in enumerate(spans)
        ]
        words = [
            {"height": 20, "top": y_min + 5, "left": 10, "width": 80}
            for y_min, _ in spans
        ]
        return rows, words

    def test_first_row_detected_without_flag(self):
        """Default call: row 0, followed by a big gap, is reported as a header."""
        rows, words = self._rows_and_words()
        detected = _detect_header_rows(rows, words, 0)
        assert 0 in detected

    def test_first_row_skipped_with_flag(self):
        """skip_first_row_header=True: row 0 is not reported despite the big gap."""
        rows, words = self._rows_and_words()
        detected = _detect_header_rows(rows, words, 0, skip_first_row_header=True)
        assert 0 not in detected


# ---------------------------------------------------------------------------
# _text_has_garbled_ipa + fix_ipa_continuation_cell
# ---------------------------------------------------------------------------

class TestGarbledIpaDetection:
    """Check garbled-IPA detection in bracket notation and its repair."""

    def test_bracket_garbled_no_ipa_chars(self):
        """'[n, nn]' (brackets without real IPA characters) is reported as garbled."""
        verdict = _text_has_garbled_ipa("[n, nn]")
        assert verdict is True

    def test_bracket_garbled_alphanumeric(self):
        """'[1uedtX,1]' (digits and letters in brackets) is reported as garbled."""
        verdict = _text_has_garbled_ipa("[1uedtX,1]")
        assert verdict is True

    def test_bracket_valid_ipa_not_garbled(self):
        """'[ɪkwˈɪpmənt]' (real IPA in brackets) is not reported as garbled."""
        verdict = _text_has_garbled_ipa("[ɪkwˈɪpmənt]")
        assert verdict is False

    def test_no_brackets_normal_word(self):
        """The plain word 'equipment' is not reported as garbled."""
        verdict = _text_has_garbled_ipa("equipment")
        assert verdict is False

    def test_fix_continuation_united_kingdom(self):
        """Garbled continuation for 'the United Kingdom' is replaced by proper IPA."""
        garbled = "[n, nn]"
        repaired = fix_ipa_continuation_cell(
            garbled, "the United Kingdom", pronunciation="british",
        )
        # The garbled text must be gone and the IPA for "Kingdom" present.
        assert repaired != "[n, nn]"
        assert "kˈɪŋdəm" in repaired

    def test_fix_continuation_equipment(self):
        """Garbled continuation for 'equipment (no pl)' is replaced by proper IPA."""
        garbled = "[1uedtX,1]"
        repaired = fix_ipa_continuation_cell(
            garbled, "equipment (no pl)", pronunciation="british",
        )
        # The garbled text must be gone and the IPA for "equipment" present.
        assert repaired != "[1uedtX,1]"
        assert "ɪkwˈɪpmənt" in repaired