"""
Tests for grid_editor_api zone merging, heading detection, and ghost filtering.

Covers:
- _merge_content_zones_across_boxes: zone merging logic
- _detect_heading_rows_by_color: heading detection by color + height
- _filter_border_ghosts: single-char ghost detection
- _detect_header_rows: skip_first_row_header flag
- _text_has_garbled_ipa / fix_ipa_continuation_cell: garbled IPA handling
"""

import sys
sys.path.insert(0, '/app')

import pytest
from cv_vocab_types import PageZone, DetectedBox
from grid_editor_api import (
    _merge_content_zones_across_boxes,
    _filter_border_ghosts,
    _detect_header_rows,
    _detect_heading_rows_by_color,
)
from cv_ocr_engines import _text_has_garbled_ipa, fix_ipa_continuation_cell


# ---------------------------------------------------------------------------
# _merge_content_zones_across_boxes
# ---------------------------------------------------------------------------

class TestMergeContentZonesAcrossBoxes:
    """Checks how [content, box, content] zone runs are collapsed."""

    @staticmethod
    def _content(index, y, height):
        """Build a content zone spanning x=0..500 at the given vertical slot."""
        return PageZone(
            index=index, zone_type="content",
            y=y, height=height, x=0, width=500,
        )

    @staticmethod
    def _box(index, y, height, x=50, width=400, confidence=0.9):
        """Build a box zone whose DetectedBox has the same geometry as the zone."""
        detected = DetectedBox(
            x=x, y=y, width=width, height=height, confidence=confidence,
        )
        return PageZone(
            index=index, zone_type="box",
            y=y, height=height, x=x, width=width,
            box=detected,
        )

    @staticmethod
    def _merge(zones):
        """Run the merge with the content bounds used by every test (0, 500)."""
        return _merge_content_zones_across_boxes(zones, 0, 500)

    def test_no_merge_when_less_than_3_zones(self):
        """Two zones cannot form a content/box/content triple → untouched."""
        merged_zones = self._merge([
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
        ])

        assert len(merged_zones) == 2
        first, second = merged_zones
        assert first.zone_type == "content"
        assert second.zone_type == "box"

    def test_merge_content_box_content(self):
        """[content, box, content] → one content zone carrying the box as overlay."""
        merged_zones = self._merge([
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
            self._content(2, y=150, height=200),
        ])

        assert len(merged_zones) == 1
        zone = merged_zones[0]
        assert zone.zone_type == "content"
        assert zone.y == 0
        assert zone.height == 350  # spans 0 to 350

        overlays = zone.image_overlays
        assert len(overlays) == 1
        assert overlays[0]["y"] == 100
        assert overlays[0]["height"] == 50

    def test_box_at_start_not_merged(self):
        """A leading box has no content above it, so it remains its own zone."""
        merged_zones = self._merge([
            self._box(0, y=0, height=50),
            self._content(1, y=50, height=100),
            self._box(2, y=150, height=50),
            self._content(3, y=200, height=200),
        ])

        # The leading box survives; the trailing content+box+content collapses.
        assert len(merged_zones) == 2
        leading, merged = merged_zones
        assert leading.zone_type == "box"
        assert merged.zone_type == "content"
        assert len(merged.image_overlays) == 1

    def test_consecutive_boxes_not_merged(self):
        """[content, box, box, content] → left as four zones (rare in practice)."""
        merged_zones = self._merge([
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
            self._box(2, y=150, height=30, x=60, width=380, confidence=0.8),
            self._content(3, y=180, height=200),
        ])

        # Only [content, box, content] triples are merged; two boxes in a
        # row break that pattern, so nothing is collapsed.
        assert len(merged_zones) == 4

    def test_zone_reindexing(self):
        """After a merge the surviving zone is numbered from 0 again."""
        merged_zones = self._merge([
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
            self._content(2, y=150, height=200),
        ])

        assert merged_zones[0].index == 0

    def test_no_boxes_passthrough(self):
        """With only content zones the zone count is unchanged."""
        merged_zones = self._merge([
            self._content(0, y=0, height=100),
            self._content(1, y=100, height=100),
        ])

        assert len(merged_zones) == 2

    def test_typical_vocab_page_pattern(self):
        """[box(VOCABULARY), content, box(image), content] → [box, merged content]."""
        merged_zones = self._merge([
            self._box(0, y=10, height=40, confidence=0.95),
            self._content(1, y=60, height=50),
            self._box(2, y=120, height=85, confidence=0.8),
            self._content(3, y=210, height=500),
        ])

        assert len(merged_zones) == 2
        header_box, body = merged_zones

        assert header_box.zone_type == "box"  # VOCABULARY header box stays
        assert body.zone_type == "content"  # merged content zone
        assert body.y == 60
        assert body.height == 710 - 60  # spans 60 to 710
        assert len(body.image_overlays) == 1
        assert body.image_overlays[0]["y"] == 120

        # Indices are renumbered consecutively.
        assert header_box.index == 0
        assert body.index == 1


# ---------------------------------------------------------------------------
# _detect_heading_rows_by_color
# ---------------------------------------------------------------------------

class TestDetectHeadingRowsByColor:
    """Checks heading detection driven by word color and word height."""

    def _make_word_box(self, text, left, top, width, height, color="black"):
        """Return a word-box dict with a fixed confidence of 90."""
        return dict(
            text=text,
            left=left,
            top=top,
            width=width,
            height=height,
            color_name=color,
            conf=90,
        )

    def _make_zone(self, cells, rows, columns, zone_index=0,
                   bbox_x=0, bbox_y=0, bbox_w=800, bbox_h=1000):
        """Wrap cells/rows/columns into a content-zone dict."""
        bbox = {"x": bbox_x, "y": bbox_y, "w": bbox_w, "h": bbox_h}
        return {
            "zone_index": zone_index,
            "zone_type": "content",
            "bbox_px": bbox,
            "cells": cells,
            "rows": rows,
            "columns": columns,
        }

    def _make_cell(self, row_index, col_index, text, left, top, width, height,
                   color="black", col_type=None):
        """Return a zone-0 cell holding a single word box with the cell's text.

        ``col_type`` defaults to ``column_<col_index + 1>``.
        """
        if col_type is None:
            col_type = f"column_{col_index + 1}"
        return {
            "cell_id": f"Z0_R{row_index:02d}_C{col_index}",
            "zone_index": 0,
            "row_index": row_index,
            "col_index": col_index,
            "col_type": col_type,
            "text": text,
            "word_boxes": [
                self._make_word_box(text, left, top, width, height, color),
            ],
        }

    @staticmethod
    def _make_row(index, y_min, y_max, is_header=False):
        """Return a row dict with pixel bounds and its header flag."""
        return {
            "index": index,
            "y_min_px": y_min,
            "y_max_px": y_max,
            "is_header": is_header,
        }

    @staticmethod
    def _two_columns():
        """Return a fresh two-column layout (column_1, column_2)."""
        return [
            {"index": 0, "label": "column_1"},
            {"index": 1, "label": "column_2"},
        ]

    def _body_row_cells(self):
        """Return the plain black 20 px 'foo'/'bar' cells used as row 1."""
        return [
            self._make_cell(1, 0, "foo", 10, 130, 80, 20, "black"),
            self._make_cell(1, 1, "bar", 300, 130, 80, 20, "black"),
        ]

    def _count_headings(self, cells, rows, columns):
        """Run detection on a single zone sized 800x1000 and return the count."""
        zones_data = [self._make_zone(cells, rows, columns)]
        return _detect_heading_rows_by_color(zones_data, 800, 1000)

    def test_blue_heading_detected(self):
        """All-blue row that is taller than its neighbours → heading."""
        heading_row = 2
        cells = []
        for row in range(5):
            top = 100 + row * 30
            # Ordinary rows are black and 20 px tall; the heading candidate
            # is blue and 25 px tall (> 1.2 * 20 = 24).
            if row == heading_row:
                color, height = "blue", 25
            else:
                color, height = "black", 20
            cells.append(
                self._make_cell(row, 0, f"word_{row}", 10, top, 80, height, color)
            )
            cells.append(
                self._make_cell(
                    row, 1, f"translation_{row}", 300, top, 100, height, color,
                )
            )

        rows = [
            self._make_row(row, 100 + row * 30, 120 + row * 30)
            for row in range(5)
        ]
        zones_data = [self._make_zone(cells, rows, self._two_columns())]

        detected = _detect_heading_rows_by_color(zones_data, 800, 1000)

        assert detected == 1
        # The row itself is flagged as a header ...
        assert rows[2]["is_header"] is True
        # ... and its cells were replaced by one heading cell with both texts.
        in_heading_row = [
            cell for cell in zones_data[0]["cells"]
            if cell["row_index"] == heading_row
        ]
        assert len(in_heading_row) == 1
        heading = in_heading_row[0]
        assert heading["col_type"] == "heading"
        assert "word_2" in heading["text"]
        assert "translation_2" in heading["text"]

    def test_black_row_not_heading(self):
        """Tall row whose words are black → not a heading."""
        cells = [
            self._make_cell(0, 0, "hello", 10, 100, 80, 25, "black"),
            self._make_cell(0, 1, "world", 300, 100, 80, 25, "black"),
            *self._body_row_cells(),
        ]
        rows = [self._make_row(0, 100, 125), self._make_row(1, 130, 150)]

        assert self._count_headings(cells, rows, self._two_columns()) == 0

    def test_mixed_color_row_not_heading(self):
        """Row mixing blue and black words → not a heading."""
        cells = [
            self._make_cell(0, 0, "Unit", 10, 100, 60, 25, "blue"),
            self._make_cell(0, 1, "normal", 300, 100, 80, 25, "black"),
            *self._body_row_cells(),
        ]
        rows = [self._make_row(0, 100, 125), self._make_row(1, 130, 150)]

        assert self._count_headings(cells, rows, self._two_columns()) == 0

    def test_colored_but_not_tall_not_heading(self):
        """All-blue row of ordinary height → not a heading."""
        cells = [
            self._make_cell(0, 0, "Unit", 10, 100, 60, 20, "blue"),
            self._make_cell(0, 1, "four", 300, 100, 60, 20, "blue"),
            *self._body_row_cells(),
        ]
        rows = [self._make_row(0, 100, 120), self._make_row(1, 130, 150)]

        assert self._count_headings(cells, rows, self._two_columns()) == 0

    def test_single_column_zone_skipped(self):
        """A zone with fewer than 2 columns yields no headings."""
        cells = [self._make_cell(0, 0, "Unit", 10, 100, 60, 25, "blue")]
        rows = [self._make_row(0, 100, 125)]
        columns = [{"index": 0, "label": "column_1"}]

        assert self._count_headings(cells, rows, columns) == 0

    def test_already_header_skipped(self):
        """A row that already has is_header=True is not counted again."""
        cells = [
            self._make_cell(
                0, 0, "Header", 10, 100, 60, 25, "blue",
                col_type="spanning_header",
            ),
            *self._body_row_cells(),
        ]
        rows = [
            self._make_row(0, 100, 125, is_header=True),
            self._make_row(1, 130, 150),
        ]

        assert self._count_headings(cells, rows, self._two_columns()) == 0


# ---------------------------------------------------------------------------
# _filter_border_ghosts (Fix 2: single-char only)
# ---------------------------------------------------------------------------

class TestFilterBorderGhosts:
    """Checks that only single-character words are dropped as border ghosts."""

    @staticmethod
    def _word(text, left, top, width, height):
        """Return a minimal word dict (text plus geometry)."""
        return {
            "text": text,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
        }

    def test_single_char_ghost_removed(self):
        """A lone '|' sitting on a box border is dropped."""
        bordered = DetectedBox(
            x=100, y=100, width=200, height=150,
            confidence=0.9, border_thickness=3,
        )
        candidates = [
            self._word("|", 98, 200, 5, 20),
            self._word("hello", 150, 150, 80, 20),
        ]

        kept, removed = _filter_border_ghosts(candidates, [bordered])

        assert removed == 1
        assert len(kept) == 1
        assert kept[0]["text"] == "hello"

    def test_multi_char_ghost_kept(self):
        """Two-character '(=' next to a bordered box is kept (real content)."""
        bordered = DetectedBox(
            x=648, y=129, width=338, height=125,
            confidence=0.7, border_thickness=3,
        )
        candidates = [
            self._word("(=", 644, 294, 16, 17),
            self._word("I", 665, 294, 9, 18),
        ]

        kept, removed = _filter_border_ghosts(candidates, [bordered])

        assert removed == 0
        assert len(kept) == 2

    def test_borderless_box_no_ghost_filter(self):
        """With border_thickness=0 nothing is filtered."""
        borderless = DetectedBox(
            x=648, y=129, width=338, height=125,
            confidence=0.7, border_thickness=0,
        )
        candidates = [
            self._word("I", 643, 272, 6, 19),  # close to the box edge
            self._word("|", 647, 200, 3, 10),  # even a pipe on the edge
        ]

        kept, removed = _filter_border_ghosts(candidates, [borderless])

        assert removed == 0  # borderless box → nothing filtered
        assert len(kept) == 2

    def test_single_paren_on_border_removed(self):
        """A lone ')' on the border is dropped."""
        bordered = DetectedBox(
            x=100, y=100, width=200, height=150,
            confidence=0.9, border_thickness=2,
        )
        candidates = [self._word(")", 299, 200, 4, 7)]

        kept, removed = _filter_border_ghosts(candidates, [bordered])

        assert removed == 1
        assert len(kept) == 0


# ---------------------------------------------------------------------------
# _detect_header_rows (Fix 3: skip_first_row_header)
# ---------------------------------------------------------------------------

class TestDetectHeaderRowsSkipFlag:
    """Checks the effect of the skip_first_row_header flag."""

    @staticmethod
    def _rows_and_words():
        """Return three rows, with a large gap after the first, and one word each."""
        rows = [
            {"y_min": y_min, "y_max": y_max, "index": index}
            for index, (y_min, y_max) in enumerate(
                [(100, 120), (160, 180), (185, 205)]
            )
        ]
        words = [
            {"height": 20, "top": top, "left": 10, "width": 80}
            for top in (105, 165, 190)
        ]
        return rows, words

    def test_first_row_detected_without_flag(self):
        """Flag omitted: first row followed by a big gap is reported as header."""
        rows, words = self._rows_and_words()

        header_rows = _detect_header_rows(rows, words, 0)

        assert 0 in header_rows

    def test_first_row_skipped_with_flag(self):
        """Flag set: the first row is not reported despite the big gap."""
        rows, words = self._rows_and_words()

        header_rows = _detect_header_rows(
            rows, words, 0, skip_first_row_header=True,
        )

        assert 0 not in header_rows


# ---------------------------------------------------------------------------
# _text_has_garbled_ipa + fix_ipa_continuation_cell
# ---------------------------------------------------------------------------

class TestGarbledIpaDetection:
    """Checks detection and repair of garbled IPA written in brackets."""

    def test_bracket_garbled_no_ipa_chars(self):
        """'[n, nn]' — bracketed text without real IPA characters → garbled."""
        assert _text_has_garbled_ipa("[n, nn]") is True

    def test_bracket_garbled_alphanumeric(self):
        """'[1uedtX,1]' — bracketed digits/letters → garbled."""
        assert _text_has_garbled_ipa("[1uedtX,1]") is True

    def test_bracket_valid_ipa_detected(self):
        """'[ɪkwˈɪpmənt]' — genuine IPA in brackets is also reported.

        Note: _text_has_garbled_ipa flags IPA-like fragments in general, so
        valid IPA triggers it as well; callers rely on a separate check
        (re.search for proper IPA brackets) to leave correct IPA alone.
        """
        assert _text_has_garbled_ipa("[ɪkwˈɪpmənt]") is True

    def test_no_brackets_normal_word(self):
        """'equipment' — an ordinary word → not garbled."""
        assert _text_has_garbled_ipa("equipment") is False

    def test_fix_continuation_united_kingdom(self):
        """Continuation cell for 'the United Kingdom' is replaced by real IPA."""
        garbled = "[n, nn]"

        repaired = fix_ipa_continuation_cell(
            garbled, "the United Kingdom", pronunciation="british",
        )

        # The garbled text must be gone and proper IPA present.
        assert repaired != "[n, nn]"
        assert "kˈɪŋdəm" in repaired  # IPA for "Kingdom"

    def test_fix_continuation_equipment(self):
        """Continuation cell for 'equipment' is replaced by real IPA."""
        garbled = "[1uedtX,1]"

        repaired = fix_ipa_continuation_cell(
            garbled, "equipment (no pl)", pronunciation="british",
        )

        assert repaired != "[1uedtX,1]"
        assert "ɪkwˈɪpmənt" in repaired  # IPA for "equipment"

    def test_fix_continuation_close_down(self):
        """Continuation cell for 'close sth. down' gets IPA for both words."""
        garbled = "[klaoz 'daun]"

        repaired = fix_ipa_continuation_cell(
            garbled, "close sth. down", pronunciation="british",
        )

        assert repaired != "[klaoz 'daun]"
        assert "klˈəʊs" in repaired  # IPA for "close"
        assert "dˈaʊn" in repaired  # IPA for "down" — must NOT be skipped

    def test_headword_with_brackets_not_continuation(self):
        """'employee [im'ploi:]' carries a headword before the bracket.

        _text_has_garbled_ipa still returns True (the text has ':'), but the
        bracket guard referred to as Step 5d should skip such a cell because
        the text does not start with '['.
        """
        # The garbled check fires because of the IPA-like ':'.
        assert _text_has_garbled_ipa("employee [im'ploi:]") is True

        # The text is not fully enclosed in brackets, so the guard rejects it.
        text = "employee [im'ploi:]"
        stripped = text.strip()
        assert not stripped.startswith('[') or not stripped.endswith(']')