""" Tests for grid_editor_api zone merging and heading detection. Covers: - _merge_content_zones_across_boxes: zone merging logic - _detect_heading_rows_by_color: heading detection by color + height """ import sys sys.path.insert(0, '/app') import pytest from cv_vocab_types import PageZone, DetectedBox from grid_editor_api import ( _merge_content_zones_across_boxes, _detect_heading_rows_by_color, ) # --------------------------------------------------------------------------- # _merge_content_zones_across_boxes # --------------------------------------------------------------------------- class TestMergeContentZonesAcrossBoxes: """Test zone merging across box zones.""" def test_no_merge_when_less_than_3_zones(self): """Fewer than 3 zones → no merge possible.""" zones = [ PageZone(index=0, zone_type="content", y=0, height=100, x=0, width=500), PageZone(index=1, zone_type="box", y=100, height=50, x=50, width=400, box=DetectedBox(x=50, y=100, width=400, height=50, confidence=0.9)), ] result = _merge_content_zones_across_boxes(zones, 0, 500) assert len(result) == 2 assert result[0].zone_type == "content" assert result[1].zone_type == "box" def test_merge_content_box_content(self): """[content, box, content] → [merged_content with overlay].""" zones = [ PageZone(index=0, zone_type="content", y=0, height=100, x=0, width=500), PageZone(index=1, zone_type="box", y=100, height=50, x=50, width=400, box=DetectedBox(x=50, y=100, width=400, height=50, confidence=0.9)), PageZone(index=2, zone_type="content", y=150, height=200, x=0, width=500), ] result = _merge_content_zones_across_boxes(zones, 0, 500) assert len(result) == 1 merged = result[0] assert merged.zone_type == "content" assert merged.y == 0 assert merged.height == 350 # 0 to 350 assert len(merged.image_overlays) == 1 assert merged.image_overlays[0]["y"] == 100 assert merged.image_overlays[0]["height"] == 50 def test_box_at_start_not_merged(self): """Box at the start (not between contents) stays separate.""" zones = [ 
PageZone(index=0, zone_type="box", y=0, height=50, x=50, width=400, box=DetectedBox(x=50, y=0, width=400, height=50, confidence=0.9)), PageZone(index=1, zone_type="content", y=50, height=100, x=0, width=500), PageZone(index=2, zone_type="box", y=150, height=50, x=50, width=400, box=DetectedBox(x=50, y=150, width=400, height=50, confidence=0.9)), PageZone(index=3, zone_type="content", y=200, height=200, x=0, width=500), ] result = _merge_content_zones_across_boxes(zones, 0, 500) # Box at start stays, then content+box+content merges assert len(result) == 2 assert result[0].zone_type == "box" assert result[1].zone_type == "content" assert len(result[1].image_overlays) == 1 def test_consecutive_boxes_not_merged(self): """[content, box, box, content] → no merge (consecutive boxes rare in practice).""" zones = [ PageZone(index=0, zone_type="content", y=0, height=100, x=0, width=500), PageZone(index=1, zone_type="box", y=100, height=50, x=50, width=400, box=DetectedBox(x=50, y=100, width=400, height=50, confidence=0.9)), PageZone(index=2, zone_type="box", y=150, height=30, x=60, width=380, box=DetectedBox(x=60, y=150, width=380, height=30, confidence=0.8)), PageZone(index=3, zone_type="content", y=180, height=200, x=0, width=500), ] result = _merge_content_zones_across_boxes(zones, 0, 500) # Two consecutive boxes: the algorithm only merges [content, box, content] # pairs, so consecutive boxes break the pattern. 
assert len(result) == 4 def test_zone_reindexing(self): """Zone indices are re-numbered after merging.""" zones = [ PageZone(index=0, zone_type="content", y=0, height=100, x=0, width=500), PageZone(index=1, zone_type="box", y=100, height=50, x=50, width=400, box=DetectedBox(x=50, y=100, width=400, height=50, confidence=0.9)), PageZone(index=2, zone_type="content", y=150, height=200, x=0, width=500), ] result = _merge_content_zones_across_boxes(zones, 0, 500) assert result[0].index == 0 def test_no_boxes_passthrough(self): """All-content zones pass through unchanged.""" zones = [ PageZone(index=0, zone_type="content", y=0, height=100, x=0, width=500), PageZone(index=1, zone_type="content", y=100, height=100, x=0, width=500), ] result = _merge_content_zones_across_boxes(zones, 0, 500) assert len(result) == 2 def test_typical_vocab_page_pattern(self): """Typical pattern: [box(VOCABULARY), content, box(image), content] → box stays, content+box+content merges.""" zones = [ PageZone(index=0, zone_type="box", y=10, height=40, x=50, width=400, box=DetectedBox(x=50, y=10, width=400, height=40, confidence=0.95)), PageZone(index=1, zone_type="content", y=60, height=50, x=0, width=500), PageZone(index=2, zone_type="box", y=120, height=85, x=50, width=400, box=DetectedBox(x=50, y=120, width=400, height=85, confidence=0.8)), PageZone(index=3, zone_type="content", y=210, height=500, x=0, width=500), ] result = _merge_content_zones_across_boxes(zones, 0, 500) assert len(result) == 2 assert result[0].zone_type == "box" # VOCABULARY header box stays assert result[1].zone_type == "content" # merged content zone assert result[1].y == 60 assert result[1].height == 710 - 60 # 60 to 710 assert len(result[1].image_overlays) == 1 assert result[1].image_overlays[0]["y"] == 120 # Check reindexing assert result[0].index == 0 assert result[1].index == 1 # --------------------------------------------------------------------------- # _detect_heading_rows_by_color # 
# ---------------------------------------------------------------------------

class TestDetectHeadingRowsByColor:
    """Exercise ``_detect_heading_rows_by_color`` on hand-built zone dicts.

    Each test builds one zone (cells, rows, columns), calls the function
    with the fixed extra arguments 800, 1000 and checks the returned count.
    NOTE(review): 800 / 1000 equal the default bbox width / height used by
    ``_make_zone``; presumably they are the image width and height --
    confirm against grid_editor_api.
    """

    def _make_word_box(self, text, left, top, width, height, color="black"):
        """Return a word-box dict; ``color`` is stored as ``color_name``, conf is 90."""
        return dict(
            text=text,
            left=left,
            top=top,
            width=width,
            height=height,
            color_name=color,
            conf=90,
        )

    def _make_zone(self, cells, rows, columns, zone_index=0,
                   bbox_x=0, bbox_y=0, bbox_w=800, bbox_h=1000):
        """Wrap cells/rows/columns in a "content" zone dict.

        The lists are stored by reference, so in-place changes made to them
        by the function under test are visible to the caller.
        """
        bbox = {"x": bbox_x, "y": bbox_y, "w": bbox_w, "h": bbox_h}
        return {
            "zone_index": zone_index,
            "zone_type": "content",
            "bbox_px": bbox,
            "cells": cells,
            "rows": rows,
            "columns": columns,
        }

    # -- private fixture builders (not collected by pytest) -----------------

    def _cell(self, row, col, text, left, top, width, height,
              color="black", col_type=None):
        """Build a zone-0 cell holding a single word box with the same text."""
        if col_type is None:
            col_type = f"column_{col + 1}"
        return {
            "cell_id": f"Z0_R{row:02d}_C{col}",
            "zone_index": 0,
            "row_index": row,
            "col_index": col,
            "col_type": col_type,
            "text": text,
            "word_boxes": [
                self._make_word_box(text, left, top, width, height, color),
            ],
        }

    @staticmethod
    def _row(index, y_min, y_max, is_header=False):
        """Build a row dict with pixel bounds and an ``is_header`` flag."""
        return {
            "index": index,
            "y_min_px": y_min,
            "y_max_px": y_max,
            "is_header": is_header,
        }

    @staticmethod
    def _columns(count=2):
        """Build ``count`` column dicts labelled column_1, column_2, ..."""
        return [{"index": i, "label": f"column_{i + 1}"} for i in range(count)]

    def _foo_bar_row(self):
        """Return the two black, height-20 cells ("foo", "bar") of row 1."""
        return [
            self._cell(1, 0, "foo", 10, 130, 80, 20),
            self._cell(1, 1, "bar", 300, 130, 80, 20),
        ]

    def _count_headings(self, cells, rows, columns):
        """Run the detector on a single zone and return its result."""
        zones_data = [self._make_zone(cells, rows, columns)]
        return _detect_heading_rows_by_color(zones_data, 800, 1000)

    # -- tests --------------------------------------------------------------

    def test_blue_heading_detected(self):
        """Row with all blue words + taller height → heading."""
        heading_ri = 2
        cells = []
        for ri in range(5):
            top = 100 + ri * 30
            if ri == heading_ri:
                # Heading row: blue and 25 px tall, i.e. above the
                # 1.2 * 20 = 24 mark noted in the original fixture.
                color, height = "blue", 25
            else:
                # Normal rows: black, height 20.
                color, height = "black", 20
            cells.append(
                self._cell(ri, 0, f"word_{ri}", 10, top, 80, height, color)
            )
            cells.append(
                self._cell(ri, 1, f"translation_{ri}", 300, top, 100, height, color)
            )

        rows = [self._row(ri, 100 + ri * 30, 120 + ri * 30) for ri in range(5)]
        columns = self._columns()
        zones_data = [self._make_zone(cells, rows, columns)]

        count = _detect_heading_rows_by_color(zones_data, 800, 1000)

        assert count == 1
        # The row dict for the heading row is flagged as a header.
        assert rows[heading_ri]["is_header"] is True

        # The heading row now holds exactly one cell of type "heading"
        # whose text contains the text of both original cells.
        heading_cells = [
            cell for cell in zones_data[0]["cells"]
            if cell["row_index"] == heading_ri
        ]
        assert len(heading_cells) == 1
        heading = heading_cells[0]
        assert heading["col_type"] == "heading"
        assert "word_2" in heading["text"]
        assert "translation_2" in heading["text"]

    def test_black_row_not_heading(self):
        """Row with black words → not a heading, even if tall."""
        cells = [
            self._cell(0, 0, "hello", 10, 100, 80, 25, "black"),
            self._cell(0, 1, "world", 300, 100, 80, 25, "black"),
            *self._foo_bar_row(),
        ]
        rows = [self._row(0, 100, 125), self._row(1, 130, 150)]

        assert self._count_headings(cells, rows, self._columns()) == 0

    def test_mixed_color_row_not_heading(self):
        """Row with some blue and some black words → not a heading."""
        cells = [
            self._cell(0, 0, "Unit", 10, 100, 60, 25, "blue"),
            self._cell(0, 1, "normal", 300, 100, 80, 25, "black"),
            *self._foo_bar_row(),
        ]
        rows = [self._row(0, 100, 125), self._row(1, 130, 150)]

        assert self._count_headings(cells, rows, self._columns()) == 0

    def test_colored_but_not_tall_not_heading(self):
        """Row with all blue words but normal height → not a heading."""
        cells = [
            self._cell(0, 0, "Unit", 10, 100, 60, 20, "blue"),
            self._cell(0, 1, "four", 300, 100, 60, 20, "blue"),
            *self._foo_bar_row(),
        ]
        rows = [self._row(0, 100, 120), self._row(1, 130, 150)]

        assert self._count_headings(cells, rows, self._columns()) == 0

    def test_single_column_zone_skipped(self):
        """Zones with < 2 columns are skipped."""
        cells = [self._cell(0, 0, "Unit", 10, 100, 60, 25, "blue")]
        rows = [self._row(0, 100, 125)]

        assert self._count_headings(cells, rows, self._columns(1)) == 0

    def test_already_header_skipped(self):
        """Rows already marked is_header are not re-detected."""
        cells = [
            self._cell(0, 0, "Header", 10, 100, 60, 25, "blue",
                       col_type="spanning_header"),
            *self._foo_bar_row(),
        ]
        rows = [
            self._row(0, 100, 125, is_header=True),
            self._row(1, 130, 150),
        ]

        assert self._count_headings(cells, rows, self._columns()) == 0