breakpilot-lehrer/klausur-service/backend/tests/test_grid_editor_api.py

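"""
Tests for grid_editor_api zone merging, heading detection, and ghost filtering,
plus the IPA repair helpers from cv_ocr_engines.

Covers:
- _merge_content_zones_across_boxes: zone merging logic
- _detect_heading_rows_by_color: heading detection by color + height
- _filter_border_ghosts: single-char ghost detection
- Step 4d pipe-divider filter: inline simulation of '|' word_box removal
- _detect_header_rows: skip_first_row_header flag
- _text_has_garbled_ipa / fix_ipa_continuation_cell: garbled IPA detection and repair
- _detect_heading_rows_by_single_cell: heading detection for single-cell rows
"""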

import sys
sys.path.insert(0, '/app')

import cv2
import numpy as np
import pytest
from cv_vocab_types import PageZone, DetectedBox
from grid_editor_api import (
    _merge_content_zones_across_boxes,
    _filter_border_ghosts,
    _detect_header_rows,
    _detect_heading_rows_by_color,
    _detect_heading_rows_by_single_cell,
)
from cv_ocr_engines import _text_has_garbled_ipa, fix_ipa_continuation_cell


# ---------------------------------------------------------------------------
# _merge_content_zones_across_boxes
# ---------------------------------------------------------------------------

class TestMergeContentZonesAcrossBoxes:
    """Exercise _merge_content_zones_across_boxes on hand-built zone lists.

    Every fixture uses full-width content zones (x=0, width=500) and box
    zones whose DetectedBox carries the same geometry as the zone itself.
    """

    @staticmethod
    def _content(index, y, height):
        """Build a full-width content zone."""
        return PageZone(index=index, zone_type="content",
                        y=y, height=height, x=0, width=500)

    @staticmethod
    def _box(index, y, height, x=50, width=400, confidence=0.9):
        """Build a box zone together with its matching DetectedBox."""
        detected = DetectedBox(x=x, y=y, width=width, height=height,
                               confidence=confidence)
        return PageZone(index=index, zone_type="box",
                        y=y, height=height, x=x, width=width, box=detected)

    @staticmethod
    def _merge(zones):
        """Run the merge with the arguments (0, 500) used by every test here."""
        return _merge_content_zones_across_boxes(zones, 0, 500)

    def test_no_merge_when_less_than_3_zones(self):
        """Fewer than 3 zones → no merge possible."""
        result = self._merge([
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
        ])
        assert [z.zone_type for z in result] == ["content", "box"]

    def test_merge_content_box_content(self):
        """[content, box, content] → [merged_content with overlay]."""
        result = self._merge([
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
            self._content(2, y=150, height=200),
        ])
        assert len(result) == 1
        merged = result[0]
        assert merged.zone_type == "content"
        assert merged.y == 0
        assert merged.height == 350  # spans 0 to 350
        assert len(merged.image_overlays) == 1
        overlay = merged.image_overlays[0]
        assert overlay["y"] == 100
        assert overlay["height"] == 50

    def test_box_at_start_not_merged(self):
        """Box at the start (not between contents) stays separate."""
        result = self._merge([
            self._box(0, y=0, height=50),
            self._content(1, y=50, height=100),
            self._box(2, y=150, height=50),
            self._content(3, y=200, height=200),
        ])
        # The leading box survives; the trailing content+box+content collapses.
        assert [z.zone_type for z in result] == ["box", "content"]
        assert len(result[1].image_overlays) == 1

    def test_consecutive_boxes_not_merged(self):
        """[content, box, box, content] → no merge (consecutive boxes rare in practice)."""
        result = self._merge([
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
            self._box(2, y=150, height=30, x=60, width=380, confidence=0.8),
            self._content(3, y=180, height=200),
        ])
        # Only [content, box, content] triples are merged, so two adjacent
        # boxes break the pattern and all four zones come back.
        assert len(result) == 4

    def test_zone_reindexing(self):
        """Zone indices are re-numbered after merging."""
        result = self._merge([
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
            self._content(2, y=150, height=200),
        ])
        assert result[0].index == 0

    def test_no_boxes_passthrough(self):
        """All-content zones pass through unchanged."""
        result = self._merge([
            self._content(0, y=0, height=100),
            self._content(1, y=100, height=100),
        ])
        assert len(result) == 2

    def test_typical_vocab_page_pattern(self):
        """Typical pattern: [box(VOCABULARY), content, box(image), content]
        → box stays, content+box+content merges."""
        result = self._merge([
            self._box(0, y=10, height=40, confidence=0.95),
            self._content(1, y=60, height=50),
            self._box(2, y=120, height=85, confidence=0.8),
            self._content(3, y=210, height=500),
        ])
        assert len(result) == 2
        header_box, merged = result
        assert header_box.zone_type == "box"  # VOCABULARY header box stays
        assert merged.zone_type == "content"  # merged content zone
        assert merged.y == 60
        assert merged.height == 710 - 60  # spans 60 to 710
        assert len(merged.image_overlays) == 1
        assert merged.image_overlays[0]["y"] == 120
        # Indices are renumbered consecutively from zero.
        assert header_box.index == 0
        assert merged.index == 1


# ---------------------------------------------------------------------------
# _detect_heading_rows_by_color
# ---------------------------------------------------------------------------

class TestDetectHeadingRowsByColor:
    """Test heading detection by color + height.

    All fixtures describe a single content zone (zone index 0) on an
    800x1000 page; each cell holds exactly one word box.
    """

    def _make_word_box(self, text, left, top, width, height, color="black"):
        """Build one OCR word-box dict with a fixed confidence of 90."""
        box = dict(text=text, left=left, top=top, width=width, height=height)
        box["color_name"] = color
        box["conf"] = 90
        return box

    def _make_zone(self, cells, rows, columns, zone_index=0,
                   bbox_x=0, bbox_y=0, bbox_w=800, bbox_h=1000):
        """Wrap cells/rows/columns into a content-zone dict (no copies are made)."""
        bbox = {"x": bbox_x, "y": bbox_y, "w": bbox_w, "h": bbox_h}
        return {
            "zone_index": zone_index,
            "zone_type": "content",
            "bbox_px": bbox,
            "cells": cells,
            "rows": rows,
            "columns": columns,
        }

    def _cell(self, row, col, text, left, top, width, height,
              color="black", col_type=None):
        """Build a zone-0 cell containing a single word box with the cell's text."""
        if col_type is None:
            col_type = f"column_{col + 1}"
        return {
            "cell_id": f"Z0_R{row:02d}_C{col}",
            "zone_index": 0,
            "row_index": row,
            "col_index": col,
            "col_type": col_type,
            "text": text,
            "word_boxes": [
                self._make_word_box(text, left, top, width, height, color),
            ],
        }

    @staticmethod
    def _row(index, y_min, y_max, is_header=False):
        """Build a row descriptor dict."""
        return {"index": index, "y_min_px": y_min, "y_max_px": y_max,
                "is_header": is_header}

    @staticmethod
    def _columns(count):
        """Build `count` column descriptors labelled column_1, column_2, ..."""
        return [{"index": i, "label": f"column_{i + 1}"} for i in range(count)]

    def _plain_second_row(self):
        """Cells of the ordinary black 'foo' / 'bar' row (row index 1)."""
        return [
            self._cell(1, 0, "foo", 10, 130, 80, 20),
            self._cell(1, 1, "bar", 300, 130, 80, 20),
        ]

    def _detect(self, cells, rows, columns):
        """Run the detector on a single zone and return the heading count."""
        zones_data = [self._make_zone(cells, rows, columns)]
        return _detect_heading_rows_by_color(zones_data, 800, 1000)

    def test_blue_heading_detected(self):
        """Row with all blue words + taller height → heading."""
        heading_ri = 2
        cells = []
        for ri in range(5):
            top = 100 + ri * 30
            if ri == heading_ri:
                # Heading row: blue and taller (25 > 1.2 * 20 = 24).
                color, height = "blue", 25
            else:
                # Normal rows: black, height 20.
                color, height = "black", 20
            cells.append(
                self._cell(ri, 0, f"word_{ri}", 10, top, 80, height, color))
            cells.append(
                self._cell(ri, 1, f"translation_{ri}", 300, top, 100, height, color))

        rows = [self._row(ri, 100 + ri * 30, 120 + ri * 30) for ri in range(5)]
        zones_data = [self._make_zone(cells, rows, self._columns(2))]

        count = _detect_heading_rows_by_color(zones_data, 800, 1000)

        assert count == 1
        # Row 2 must now be flagged as a header.
        assert rows[2]["is_header"] is True
        # The row's cells must have been replaced by a single heading cell.
        heading_cells = [
            c for c in zones_data[0]["cells"] if c["row_index"] == heading_ri
        ]
        assert len(heading_cells) == 1
        heading = heading_cells[0]
        assert heading["col_type"] == "heading"
        assert "word_2" in heading["text"]
        assert "translation_2" in heading["text"]

    def test_black_row_not_heading(self):
        """Row with black words → not a heading, even if tall."""
        cells = [
            self._cell(0, 0, "hello", 10, 100, 80, 25, "black"),
            self._cell(0, 1, "world", 300, 100, 80, 25, "black"),
            *self._plain_second_row(),
        ]
        rows = [self._row(0, 100, 125), self._row(1, 130, 150)]
        assert self._detect(cells, rows, self._columns(2)) == 0

    def test_mixed_color_row_not_heading(self):
        """Row with some blue and some black words → not a heading."""
        cells = [
            self._cell(0, 0, "Unit", 10, 100, 60, 25, "blue"),
            self._cell(0, 1, "normal", 300, 100, 80, 25, "black"),
            *self._plain_second_row(),
        ]
        rows = [self._row(0, 100, 125), self._row(1, 130, 150)]
        assert self._detect(cells, rows, self._columns(2)) == 0

    def test_colored_but_not_tall_not_heading(self):
        """Row with all blue words but normal height → not a heading."""
        cells = [
            self._cell(0, 0, "Unit", 10, 100, 60, 20, "blue"),
            self._cell(0, 1, "four", 300, 100, 60, 20, "blue"),
            *self._plain_second_row(),
        ]
        rows = [self._row(0, 100, 120), self._row(1, 130, 150)]
        assert self._detect(cells, rows, self._columns(2)) == 0

    def test_single_column_zone_skipped(self):
        """Zones with < 2 columns are skipped."""
        cells = [self._cell(0, 0, "Unit", 10, 100, 60, 25, "blue")]
        rows = [self._row(0, 100, 125)]
        assert self._detect(cells, rows, self._columns(1)) == 0

    def test_already_header_skipped(self):
        """Rows already marked is_header are not re-detected."""
        cells = [
            self._cell(0, 0, "Header", 10, 100, 60, 25, "blue",
                       col_type="spanning_header"),
            *self._plain_second_row(),
        ]
        rows = [
            self._row(0, 100, 125, is_header=True),
            self._row(1, 130, 150),
        ]
        assert self._detect(cells, rows, self._columns(2)) == 0


# ---------------------------------------------------------------------------
# _filter_border_ghosts (Fix 2: single-char only)
# ---------------------------------------------------------------------------

class TestFilterBorderGhosts:
    """Test that ghost filtering only removes single-char words."""

    @staticmethod
    def _word(text, left, top, width, height):
        """Build a minimal OCR word dict."""
        return {"text": text, "left": left, "top": top,
                "width": width, "height": height}

    @staticmethod
    def _bordered_box(x, y, width, height, confidence, border_thickness):
        """Build a DetectedBox with an explicit border thickness."""
        return DetectedBox(x=x, y=y, width=width, height=height,
                           confidence=confidence,
                           border_thickness=border_thickness)

    def test_single_char_ghost_removed(self):
        """Single '|' on a box border → filtered as ghost."""
        box = self._bordered_box(100, 100, 200, 150, 0.9, 3)
        words = [
            self._word("|", 98, 200, 5, 20),
            self._word("hello", 150, 150, 80, 20),
        ]
        filtered, count = _filter_border_ghosts(words, [box])
        assert count == 1
        assert [w["text"] for w in filtered] == ["hello"]

    def test_multi_char_ghost_kept(self):
        """Multi-char '(=' on a bordered box → NOT filtered (real content)."""
        box = self._bordered_box(648, 129, 338, 125, 0.7, 3)
        words = [
            self._word("(=", 644, 294, 16, 17),
            self._word("I", 665, 294, 9, 18),
        ]
        filtered, count = _filter_border_ghosts(words, [box])
        assert count == 0
        assert len(filtered) == 2

    def test_borderless_box_no_ghost_filter(self):
        """Borderless box (border_thickness=0) → no ghost filtering at all."""
        box = self._bordered_box(648, 129, 338, 125, 0.7, 0)
        words = [
            self._word("I", 643, 272, 6, 19),   # near the box edge
            self._word("|", 647, 200, 3, 10),   # even a pipe on the edge
        ]
        filtered, count = _filter_border_ghosts(words, [box])
        assert count == 0  # borderless box: nothing may be dropped
        assert len(filtered) == 2

    def test_single_paren_on_border_removed(self):
        """Single ')' on border → filtered."""
        box = self._bordered_box(100, 100, 200, 150, 0.9, 2)
        words = [self._word(")", 299, 200, 4, 7)]
        filtered, count = _filter_border_ghosts(words, [box])
        assert count == 1
        assert len(filtered) == 0


# ---------------------------------------------------------------------------
# Step 4d: Pipe-character divider filter
# ---------------------------------------------------------------------------

class TestPipeDividerFilter:
    """Step 4d removes '|' word_boxes that are OCR artifacts from column dividers.

    These tests do not call production code: they run a local simulation of
    Step 4d (see `_strip_pipe_word_boxes`) against hand-built zone dicts.
    The simulation lives in one helper so every test checks the same logic.
    """

    @staticmethod
    def _is_pipe_only(text):
        """Return True when `text` (stripped) consists solely of '|' characters.

        `None` is treated like an empty string and therefore is not a pipe.
        """
        import re  # local import keeps this class self-contained

        return re.match(r"^\|+$", (text or "").strip()) is not None

    @classmethod
    def _strip_pipe_word_boxes(cls, zone):
        """Simulate Step 4d on `zone` in place and return the removed-box count.

        For each cell that loses at least one pipe-only word box, the cell's
        ``word_boxes`` is replaced by the survivors and its ``text`` is
        rebuilt from them in reading order (top, then left).  If anything was
        removed, cells left with neither word boxes nor text are dropped from
        ``zone["cells"]``.
        """
        def box_text(wb):
            # Same None-tolerant read for filtering and for rebuilding text.
            return (wb.get("text") or "").strip()

        removed = 0
        for cell in zone["cells"]:
            boxes = cell.get("word_boxes") or []
            kept = [wb for wb in boxes if not cls._is_pipe_only(wb.get("text"))]
            dropped = len(boxes) - len(kept)
            if not dropped:
                continue
            removed += dropped
            cell["word_boxes"] = kept
            ordered = sorted(kept, key=lambda w: (w.get("top", 0), w.get("left", 0)))
            cell["text"] = " ".join(box_text(wb) for wb in ordered if box_text(wb))

        if removed:
            zone["cells"] = [
                c for c in zone["cells"]
                if (c.get("word_boxes") or (c.get("text") or "").strip())
            ]
        return removed

    def test_pipe_word_boxes_removed(self):
        """Word boxes with text '|' or '||' are removed from cells."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello | world",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                        {"text": "|", "top": 10, "left": 55, "height": 15, "width": 5},
                        {"text": "world", "top": 10, "left": 65, "height": 15, "width": 40},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }
        self._strip_pipe_word_boxes(zone)
        assert len(zone["cells"][0]["word_boxes"]) == 2
        assert zone["cells"][0]["text"] == "hello world"

    def test_pipe_only_cell_removed(self):
        """A cell containing only '|' word_boxes becomes empty and is removed."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                    ],
                },
                {
                    "cell_id": "Z0_R0_C1",
                    "text": "|",
                    "word_boxes": [
                        {"text": "|", "top": 10, "left": 740, "height": 15, "width": 5},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }
        removed = self._strip_pipe_word_boxes(zone)
        assert removed == 1
        assert len(zone["cells"]) == 1
        assert zone["cells"][0]["text"] == "hello"

    def test_double_pipe_removed(self):
        """'||' is also treated as a divider artifact."""
        assert self._is_pipe_only("||")
        assert self._is_pipe_only("|")
        assert not self._is_pipe_only("hello")
        assert not self._is_pipe_only("|word")


# ---------------------------------------------------------------------------
# _detect_header_rows (Fix 3: skip_first_row_header)
# ---------------------------------------------------------------------------

class TestDetectHeaderRowsSkipFlag:
    """Test skip_first_row_header flag."""

    @staticmethod
    def _layout_with_gap_after_first_row():
        """Return (rows, words): three rows with a large gap after row 0.

        Row 0 ends at y=120 and row 1 starts at y=160, while rows 1 and 2 are
        only 5px apart.  Each row holds one 20px-tall word placed 5px below
        the row's top edge.
        """
        spans = [(100, 120), (160, 180), (185, 205)]
        rows = [
            {"y_min": y_min, "y_max": y_max, "index": idx}
            for idx, (y_min, y_max) in enumerate(spans)
        ]
        words = [
            {"height": 20, "top": y_min + 5, "left": 10, "width": 80}
            for y_min, _ in spans
        ]
        return rows, words

    def test_first_row_detected_without_flag(self):
        """Without flag, first row with big gap → header."""
        rows, words = self._layout_with_gap_after_first_row()
        headers = _detect_header_rows(rows, words, 0)
        assert 0 in headers

    def test_first_row_skipped_with_flag(self):
        """With skip flag, first row NOT detected even with big gap."""
        rows, words = self._layout_with_gap_after_first_row()
        headers = _detect_header_rows(rows, words, 0, skip_first_row_header=True)
        assert 0 not in headers


# ---------------------------------------------------------------------------
# _text_has_garbled_ipa + fix_ipa_continuation_cell
# ---------------------------------------------------------------------------

class TestGarbledIpaDetection:
    """Test detection and fixing of garbled IPA in bracket notation.

    Detection tests call `_text_has_garbled_ipa`; repair tests call
    `fix_ipa_continuation_cell(garbled, headword_text, pronunciation="british")`
    and check the returned string.
    """

    def test_bracket_garbled_no_ipa_chars(self):
        """'[n, nn]' — brackets with no real IPA chars → garbled."""
        assert _text_has_garbled_ipa("[n, nn]") is True

    def test_bracket_garbled_alphanumeric(self):
        """'[1uedtX,1]' — brackets with digits/letters → garbled."""
        assert _text_has_garbled_ipa("[1uedtX,1]") is True

    def test_bracket_valid_ipa_detected(self):
        """'[ɪkwˈɪpmənt]' — brackets with real IPA → detected (has IPA chars).

        Note: _text_has_garbled_ipa detects IPA-like fragments in text.
        Valid IPA also triggers it; callers use a separate check
        (re.search for proper IPA brackets) to skip already-correct IPA.
        """
        assert _text_has_garbled_ipa("[ɪkwˈɪpmənt]") is True

    def test_no_brackets_normal_word(self):
        """'equipment' — normal word → not garbled."""
        assert _text_has_garbled_ipa("equipment") is False

    def test_fix_continuation_united_kingdom(self):
        """IPA continuation for 'the United Kingdom' → IPA without 'the'."""
        garbled = "[n, nn]"
        fixed = fix_ipa_continuation_cell(
            garbled, "the United Kingdom", pronunciation="british",
        )
        # The garbled text must be replaced by proper IPA.
        assert fixed != garbled
        assert "kˈɪŋdəm" in fixed  # Kingdom IPA
        assert "ðə" not in fixed    # "the" must NOT get IPA

    def test_fix_continuation_equipment(self):
        """IPA continuation for 'equipment' → proper IPA."""
        garbled = "[1uedtX,1]"
        fixed = fix_ipa_continuation_cell(
            garbled, "equipment (no pl)", pronunciation="british",
        )
        assert fixed != garbled
        assert "ɪkwˈɪpmənt" in fixed  # equipment IPA

    def test_fix_continuation_close_down(self):
        """IPA continuation for 'close sth. down' → IPA for both words."""
        garbled = "[klaoz 'daun]"
        fixed = fix_ipa_continuation_cell(
            garbled, "close sth. down", pronunciation="british",
        )
        assert fixed != garbled
        assert "klˈəʊs" in fixed   # close IPA
        assert "dˈaʊn" in fixed    # down IPA — must NOT be skipped

    def test_continuation_skips_words_with_inline_ipa(self):
        """'beat [bˈiːt] , beat, beaten' → continuation only for 'beaten'."""
        fixed = fix_ipa_continuation_cell(
            "[bi:tan]", "beat [bˈiːt] , beat, beaten", pronunciation="british",
        )
        # Should only have IPA for "beaten", NOT for "beat" (already inline).
        assert "bˈiːtən" in fixed
        # "bˈiːtən" itself contains "bˈiːt", so a substring count can never be
        # zero once the assertion above holds.  The only result that carries
        # IPA for "beaten" alone is therefore the exact bracketed form, which
        # is what the previous `count(...) == 0 or fixed == ...` check
        # effectively enforced.
        assert fixed == "[bˈiːtən]"

    def test_continuation_bracket_at_end_returns_inline(self):
        """'the Highlands [ˈhaɪləndz]' → return inline IPA, not IPA for 'the'."""
        fixed = fix_ipa_continuation_cell(
            "'hailandz", "the Highlands [ˈhaɪləndz]", pronunciation="british",
        )
        assert fixed == "[ˈhaɪləndz]"
        assert "ðə" not in fixed  # "the" must NOT get IPA

    def test_headword_with_brackets_not_continuation(self):
        """'employee [im'ploi:]' has a headword outside brackets → not garbled.

        _text_has_garbled_ipa returns True (has ':'), but Step 5d should
        skip this cell because text doesn't start with '['.
        """
        text = "employee [im'ploi:]"
        # The garbled check still triggers (has IPA-like ':').
        assert _text_has_garbled_ipa(text) is True
        # But the text does NOT start with '[', so the Step 5d bracket guard
        # (re-stated inline here, not imported) blocks it.
        stripped = text.strip()
        assert not (stripped.startswith('[') and stripped.endswith(']'))


# ---------------------------------------------------------------------------
# _detect_heading_rows_by_single_cell
# ---------------------------------------------------------------------------

class TestDetectHeadingRowsBySingleCell:
    """Heading detection for black single-cell rows (e.g. 'Theme').

    Every test starts from the synthetic zone built by ``_make_vocab_zone``
    and then edits individual rows before running a detector on it.
    """

    def _make_word_box(self, text, left, top, width, height, color="black"):
        """Return a word-box dict for *text* with a fixed confidence of 90."""
        return dict(
            text=text, left=left, top=top,
            width=width, height=height, color_name=color, conf=90,
        )

    def _make_vocab_zone(self):
        """Build a 4-column vocab zone with 8 rows.

        Columns: column_1 (page_ref), column_2 (EN), column_3 (DE),
        column_4 (example).  Rows are 30 px apart starting at y=100 and each
        word box is 20 px high.  Row 4 holds a single cell ("Theme") in
        column_2 and is the heading candidate; every other row has 4 cells.
        """
        def cell(row, col, text, box_text, left, width):
            # One grid cell carrying exactly one word box on the row's top edge.
            return {
                "cell_id": f"Z0_R{row:02d}_C{col}",
                "zone_index": 0, "row_index": row, "col_index": col,
                "col_type": f"column_{col + 1}", "text": text,
                "word_boxes": [
                    self._make_word_box(box_text, left, 100 + row * 30, width, 20),
                ],
            }

        cells = []
        for row in range(8):
            if row == 4:
                # Single-cell row: "Theme" in column_2 only
                cells.append(cell(row, 1, "Theme", "Theme", 130, 70))
                continue
            # Normal vocab row: one cell per column
            page_ref = f"p.{70 + row}"
            english = f"word_{row}"
            german = f"Wort_{row}"
            cells.extend([
                cell(row, 0, page_ref, page_ref, 10, 30),
                cell(row, 1, english, english, 130, 80),
                cell(row, 2, german, german, 400, 80),
                cell(row, 3, f"Example sentence {row}.", "Example", 600, 120),
            ])

        rows = []
        for row in range(8):
            top = 100 + row * 30
            rows.append({
                "index": row, "y_min_px": top, "y_max_px": top + 20,
                "is_header": False,
            })

        columns = [
            {"col_index": col, "col_type": f"column_{col + 1}"}
            for col in range(4)
        ]
        return {
            "zone_index": 0, "zone_type": "content",
            "bbox_px": {"x": 0, "y": 0, "w": 800, "h": 1000},
            "cells": cells, "rows": rows, "columns": columns,
        }

    def test_single_cell_heading_detected(self):
        """Row with only 1 content cell in column_2 → heading."""
        zone = self._make_vocab_zone()
        detected = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        assert detected == 1
        row_4 = [c for c in zone["cells"] if c["row_index"] == 4]
        assert len(row_4) == 1
        heading = row_4[0]
        assert heading["col_type"] == "heading"
        assert heading["text"] == "Theme"
        assert heading["col_index"] == 1  # Starts at column_2, not 0

    def test_single_cell_in_last_column_not_heading(self):
        """Row with only 1 cell in column_4 (last) → NOT heading (continuation)."""
        zone = self._make_vocab_zone()
        continuation = {
            "cell_id": "Z0_R04_C3",
            "zone_index": 0, "row_index": 4, "col_index": 3,
            "col_type": "column_4", "text": "2. Veränderung",
            "word_boxes": [self._make_word_box("2.", 600, 220, 100, 20)],
        }
        # Drop the "Theme" cell (row 4, column_2) and put a lone column_4
        # cell into row 4 instead.
        without_theme = [
            c for c in zone["cells"]
            if (c["row_index"], c["col_index"]) != (4, 1)
        ]
        zone["cells"] = without_theme + [continuation]
        detected = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        # Row 4 now only has column_4 → should NOT be heading, and with
        # "Theme" gone there is no other heading candidate left.
        assert detected == 0

    def test_ipa_bracket_text_not_heading(self):
        """Row with single cell starting with '[' → IPA continuation, not heading."""
        zone = self._make_vocab_zone()
        # Replace "Theme" with IPA continuation
        theme_cell = next(
            c for c in zone["cells"]
            if c["row_index"] == 4 and c["col_index"] == 1
        )
        theme_cell["text"] = "[θˈiːm]"
        detected = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        assert detected == 0

    def test_multi_cell_row_not_heading(self):
        """Normal vocab row with multiple cells → NOT heading."""
        zone = self._make_vocab_zone()
        detected = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        # Only row 4 (Theme) should be heading
        assert detected == 1
        # Verify the multi-cell rows are NOT marked as heading
        normal_rows = {0, 1, 2, 3, 5, 6, 7}
        for c in zone["cells"]:
            if c["row_index"] in normal_rows:
                assert c["col_type"] != "heading"

    def test_color_heading_preserves_correct_col_index(self):
        """Color heading starting in column_2 → col_index should be 1, not 0."""
        zone = self._make_vocab_zone()
        # Make row 3 a color heading: blue words in column_2 and column_3 only
        # (no column_1 page_ref for this row)
        cells = [c for c in zone["cells"] if c["row_index"] != 3]
        cells.append({
            "cell_id": "Z0_R03_C1", "zone_index": 0, "row_index": 3,
            "col_index": 1, "col_type": "column_2", "text": "Unit 4:",
            "word_boxes": [
                self._make_word_box("Unit", 130, 190, 50, 26, "blue"),
                self._make_word_box("4:", 185, 190, 20, 26, "blue"),
            ],
        })
        cells.append({
            "cell_id": "Z0_R03_C2", "zone_index": 0, "row_index": 3,
            "col_index": 2, "col_type": "column_3", "text": "Scotland",
            "word_boxes": [
                self._make_word_box("Scotland", 400, 190, 100, 26, "blue"),
            ],
        })
        zone["cells"] = cells
        detected = _detect_heading_rows_by_color([zone], 800, 1000)
        assert detected == 1
        row_3 = [c for c in zone["cells"] if c["row_index"] == 3]
        assert len(row_3) == 1
        heading = row_3[0]
        assert heading["col_type"] == "heading"
        assert heading["col_index"] == 1  # Should start at column_2, not 0

    def test_last_row_single_cell_not_heading(self):
        """Single-cell in last row (e.g. page number '212') → NOT heading."""
        zone = self._make_vocab_zone()
        # Make row 7 (the last) have only 1 cell in column_2
        cells = [c for c in zone["cells"] if c["row_index"] != 7]
        cells.append({
            "cell_id": "Z0_R07_C1",
            "zone_index": 0, "row_index": 7, "col_index": 1,
            "col_type": "column_2", "text": "two hundred and twelve",
            "word_boxes": [self._make_word_box("two", 130, 310, 30, 20)],
        })
        zone["cells"] = cells
        detected = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        # Row 4 "Theme" = heading, but row 7 (last) should NOT be heading
        assert detected == 1
        heading_rows = {
            c["row_index"] for c in zone["cells"]
            if c.get("col_type") == "heading"
        }
        assert 7 not in heading_rows


# ---------------------------------------------------------------------------
# Step 5h: Slash-IPA to bracket conversion
# ---------------------------------------------------------------------------

class TestSlashIpaConversion:
    """Step 5h converts /ocr_ipa/ patterns to [dictionary_ipa] notation."""

    def _run_step_5h(self, text: str) -> str:
        """Apply the Step 5h slash-IPA conversion to one text string.

        The regex logic is re-implemented locally here; only the dictionary
        lookup (``_lookup_ipa``) comes from ``cv_ocr_engines``.  Three passes:

        1. ``headword/ipa/`` → ``headword [ipa]`` (dictionary IPA when the
           lookup returns one, otherwise the OCR text between the slashes).
        2. A further ``/ipa/`` directly after a ``]`` → `` [ipa]``.
        3. Only if nothing changed so far: a leading ``/ipa/`` with no
           headword → ``[ipa]``.

        Slash content containing whitespace, parentheses or commas is left
        untouched in every pass.
        """
        import re
        from cv_ocr_engines import _lookup_ipa

        headword_slash_re = re.compile(
            r'(\b[a-zA-Z]+[²³¹]?)\s*'
            r"(/[^/]{2,}/)"
        )
        after_bracket_re = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
        leading_slash_re = re.compile(r'^/([^/]{2,})/')
        reject_re = re.compile(r'[\s(),]')

        def convert_headword(match):
            headword, slashed = match.group(1), match.group(2)
            raw = slashed.strip("/").strip()
            if reject_re.search(raw):
                return match.group(0)
            # Superscript/digit markers are not part of the dictionary key.
            lookup_key = re.sub(r'[²³¹\d]', '', headword).strip()
            dict_ipa = _lookup_ipa(lookup_key, "british") if lookup_key else None
            # Prefer the dictionary; otherwise keep the OCR transcription.
            replacement = dict_ipa or raw.lstrip("'").strip()
            if replacement:
                return f"{headword} [{replacement}]"
            return match.group(0)

        def convert_trailing(match):
            body = match.group(1).strip("/").strip().lstrip("'").strip()
            if body and not reject_re.search(body):
                return f" [{body}]"
            return match.group(0)

        converted = headword_slash_re.sub(convert_headword, text)
        # Second pass: trailing /ipa/ after [ipa]
        converted = after_bracket_re.sub(convert_trailing, converted)
        if converted != text:
            return converted

        # Nothing matched so far: try a standalone /ipa/ at the very start.
        leading = leading_slash_re.match(text)
        if leading is None:
            return converted
        body = leading.group(1).strip()
        if reject_re.search(body):
            return converted
        body = body.lstrip("'").strip()
        if not body:
            return converted
        return "[" + body + "]" + text[leading.end():]

    def test_tiger_dict_lookup(self):
        """tiger /'taiga/ → tiger [tˈaɪgə] (from dictionary)."""
        converted = self._run_step_5h("tiger /'taiga/ Nomen Tiger")
        assert converted.startswith("tiger")
        assert "[tˈaɪgə]" in converted
        assert "/'taiga/" not in converted

    def test_tight_no_space(self):
        """tight²/tait/ → tight² [tˈaɪt] (no space before slash)."""
        converted = self._run_step_5h("tight²/tait/ Adv fest")
        assert "[tˈaɪt]" in converted
        assert "/tait/" not in converted

    def test_unknown_word_falls_back_to_ocr(self):
        """tinned/und/ → tinned [und] (not in dictionary, keeps OCR IPA)."""
        converted = self._run_step_5h("tinned/und/ Adj Dosen-")
        assert "[und]" in converted
        assert "/und/" not in converted

    def test_sb_sth_not_matched(self):
        """sb/sth should NOT be treated as IPA (contains space/parens)."""
        original = "(tie sb/sth up) jdn/etwas anbinden"
        # The inner content "sth up) jdn" has spaces and parens → rejected
        assert self._run_step_5h(original) == original  # unchanged

    def test_double_ipa_both_converted(self):
        """times/taimz/ /tamz/ → times [tˈaɪmz] [tamz] (both converted)."""
        converted = self._run_step_5h("times/taimz/ /tamz/ Präp")
        for bracketed in ("[tˈaɪmz]", "[tamz]"):
            assert bracketed in converted
        for slashed in ("/taimz/", "/tamz/"):
            assert slashed not in converted

    def test_standalone_slash_ipa_at_start(self):
        """/tam/ Nomen → [tam] Nomen (no headword in cell)."""
        converted = self._run_step_5h("/tam/ Nomen 1 Zeit")
        assert converted.startswith("[tam]")
        assert "/tam/" not in converted

    def test_no_slashes_unchanged(self):
        """Text without slashes passes through unchanged."""
        plain = "hello world"
        assert self._run_step_5h(plain) == plain

    def test_tile_dict_lookup(self):
        """tile /tail/ → tile [tˈaɪl]."""
        converted = self._run_step_5h("tile /tail/ Nomen Dachziegel")
        assert "[tˈaɪl]" in converted


# ---------------------------------------------------------------------------
# Color detection: red false-positive suppression
# ---------------------------------------------------------------------------

class TestRedFalsePositiveSuppression:
    """Red requires median_sat >= 80 to avoid scanner artifact false positives.

    The threshold is the one stated by the test author; the classifier itself
    lives outside this file.
    """

    def test_low_saturation_red_classified_as_black(self):
        """Black text with slight warm scanner tint (sat ~60) → black, not red."""
        from cv_color_detect import detect_word_colors

        # Uniform 200x40 (w x h) image of dark, slightly warm pixels.
        # HSV: hue=5 (red range), sat=60 (above 55 threshold but below 80), val=40
        tinted_pixel = [5, 60, 40]
        hsv = np.full((40, 200, 3), tinted_pixel, dtype=np.uint8)
        bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)

        word_boxes = [
            {"left": 10, "top": 5, "width": 50, "height": 20, "text": "test"},
        ]
        detect_word_colors(bgr, word_boxes)  # annotates the boxes in place
        detected = word_boxes[0]["color_name"]
        assert detected == "black", \
            f"Expected black, got {detected} (scanner artifact false positive)"

    def test_high_saturation_red_classified_as_red(self):
        """Genuinely red text (sat=180) → red."""
        from cv_color_detect import detect_word_colors

        # White background (H=0, S=0, V=255) ...
        white_pixel = [0, 0, 255]
        hsv = np.full((40, 200, 3), white_pixel, dtype=np.uint8)
        # ... with a red text region (H=5, S=180, V=200) inside the word box.
        hsv[8:18, 15:55] = [5, 180, 200]
        bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)

        word_boxes = [
            {"left": 10, "top": 5, "width": 50, "height": 20, "text": "red"},
        ]
        detect_word_colors(bgr, word_boxes)  # annotates the boxes in place
        detected = word_boxes[0]["color_name"]
        assert detected == "red", \
            f"Expected red, got {detected}"