"""Tests for unified_grid.py — merging multi-zone grids into single zone.""" import pytest import sys, os sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from unified_grid import ( _compute_dominant_row_height, _classify_boxes, build_unified_grid, ) def _make_content_zone(rows_y, num_cols=4, bbox_w=1400): """Helper: create a content zone with rows at given y positions.""" rows = [{"index": i, "y_min_px": y, "y_max_px": y + 30, "y_min": y, "y_max": y + 30, "is_header": False} for i, y in enumerate(rows_y)] cols = [{"index": i, "x_min_px": i * (bbox_w // num_cols), "x_max_px": (i + 1) * (bbox_w // num_cols)} for i in range(num_cols)] cells = [{"row_index": r["index"], "col_index": c["index"], "col_type": f"column_{c['index']+1}", "text": f"R{r['index']}C{c['index']}", "cell_id": f"R{r['index']}C{c['index']}", "word_boxes": [], "confidence": 90, "is_bold": False, "ocr_engine": "test", "bbox_px": {"x": 0, "y": 0, "w": 100, "h": 30}, "bbox_pct": {"x": 0, "y": 0, "w": 10, "h": 2}} for r in rows for c in cols] return { "zone_index": 1, "zone_type": "content", "bbox_px": {"x": 50, "y": rows_y[0] - 10, "w": bbox_w, "h": rows_y[-1] - rows_y[0] + 50}, "bbox_pct": {"x": 3, "y": 10, "w": 85, "h": 80}, "columns": cols, "rows": rows, "cells": cells, "header_rows": [], "border": None, "word_count": len(cells), } def _make_box_zone(zone_index, bbox, cells_data, bg_hex="#2563eb", layout_type="flowing"): """Helper: create a box zone.""" rows = [{"index": i, "y_min_px": bbox["y"] + i * 30, "y_max_px": bbox["y"] + (i + 1) * 30, "is_header": i == 0} for i in range(len(cells_data))] cols = [{"index": 0, "x_min_px": bbox["x"], "x_max_px": bbox["x"] + bbox["w"]}] cells = [{"row_index": i, "col_index": 0, "col_type": "column_1", "text": text, "cell_id": f"Z{zone_index}_R{i}C0", "word_boxes": [], "confidence": 90, "is_bold": False, "ocr_engine": "test", "bbox_px": {"x": bbox["x"], "y": bbox["y"] + i * 30, "w": bbox["w"], "h": 30}, "bbox_pct": {"x": 50, "y": 50, "w": 30, "h": 10}} for i, text in enumerate(cells_data)] return { "zone_index": zone_index, "zone_type": "box", "bbox_px": bbox, "bbox_pct": {"x": 50, "y": 50, "w": 30, "h": 10}, "columns": cols, "rows": rows, "cells": cells, "header_rows": [0], "border": None, "word_count": len(cells), "box_bg_hex": bg_hex, "box_bg_color": "blue", "box_layout_type": layout_type, } class TestDominantRowHeight: def test_regular_spacing(self): """Rows with uniform spacing → median = that spacing.""" zone = _make_content_zone([100, 147, 194, 241, 288]) h = _compute_dominant_row_height(zone) assert h == 47 def test_filters_large_gaps(self): """Large gaps (box interruptions) are filtered out.""" zone = _make_content_zone([100, 147, 194, 600, 647, 694]) # spacings: 47, 47, 406(!), 47, 47 → filter >100 → median of [47,47,47,47] = 47 h = _compute_dominant_row_height(zone) assert h == 47 def test_single_row(self): """Single row → default 47.""" zone = _make_content_zone([100]) h = _compute_dominant_row_height(zone) assert h == 47.0 class TestClassifyBoxes: def test_full_width(self): """Box wider than 85% of content → full_width.""" boxes = [_make_box_zone(2, {"x": 50, "y": 500, "w": 1300, "h": 200}, ["Header", "Text"])] result = _classify_boxes(boxes, content_width=1400) assert result[0]["classification"] == "full_width" def test_partial_width_right(self): """Narrow box on right side → partial_width, side=right.""" boxes = [_make_box_zone(2, {"x": 800, "y": 500, "w": 500, "h": 200}, ["Header", "Text"])] result = _classify_boxes(boxes, content_width=1400) assert result[0]["classification"] == "partial_width" assert result[0]["side"] == "right" def test_partial_width_left(self): """Narrow box on left side → partial_width, side=left.""" boxes = [_make_box_zone(2, {"x": 50, "y": 500, "w": 500, "h": 200}, ["Header", "Text"])] result = _classify_boxes(boxes, content_width=1400) assert result[0]["classification"] == "partial_width" assert result[0]["side"] == "left" def test_text_line_count(self): """Total text lines counted including \\n.""" boxes = [_make_box_zone(2, {"x": 50, "y": 500, "w": 500, "h": 200}, ["Header", "Line1\nLine2\nLine3"])] result = _classify_boxes(boxes, content_width=1400) assert result[0]["total_lines"] == 4 # "Header" (1) + "Line1\nLine2\nLine3" (3) class TestBuildUnifiedGrid: def test_content_only(self): """Content zone without boxes → single unified zone.""" content = _make_content_zone([100, 147, 194, 241]) result = build_unified_grid([content], 1600, 2200, {}) assert result["is_unified"] is True assert len(result["zones"]) == 1 assert result["zones"][0]["zone_type"] == "unified" assert result["summary"]["total_rows"] == 4 def test_full_width_box_integration(self): """Full-width box rows are integrated into unified grid.""" content = _make_content_zone([100, 147, 194, 600, 647]) box = _make_box_zone(2, {"x": 50, "y": 300, "w": 1300, "h": 200}, ["Box Header", "Box Row 1", "Box Row 2"]) result = build_unified_grid([content, box], 1600, 2200, {}) assert result["is_unified"] is True total_rows = result["summary"]["total_rows"] # 5 content rows + 3 box rows = 8 assert total_rows == 8 def test_box_cells_tagged(self): """Box-origin cells have source_zone_type and box_region.""" content = _make_content_zone([100, 147, 600, 647]) box = _make_box_zone(2, {"x": 50, "y": 300, "w": 1300, "h": 200}, ["Box Text"]) result = build_unified_grid([content, box], 1600, 2200, {}) box_cells = [c for c in result["zones"][0]["cells"] if c.get("source_zone_type") == "box"] assert len(box_cells) > 0 assert box_cells[0]["box_region"]["bg_hex"] == "#2563eb" def test_no_content_zone(self): """No content zone → returns zones as-is.""" box = _make_box_zone(2, {"x": 50, "y": 300, "w": 500, "h": 200}, ["Text"]) result = build_unified_grid([box], 1600, 2200, {}) assert "zones" in result