Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 50s
CI / test-go-edu-search (push) Successful in 45s
CI / test-python-klausur (push) Failing after 2m30s
CI / test-python-agent-core (push) Successful in 31s
CI / test-nodejs-website (push) Successful in 34s
test_unified_grid.py (10 tests): - Dominant row height calculation (regular, gaps filtered, single row) - Box classification (full-width, partial left/right, text line count) - Unified grid building (content-only, box integration, cell tagging) test_box_layout.py (13 tests): - Layout classification (header_only, flowing, bullet_list) - Line grouping by y-proximity - Flowing layout indent grouping (bullet + continuations → \n) - Row/column field completeness for GridTable compatibility Total: 66 tests passing (43 smart_spell + 13 box_layout + 10 unified) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
142 lines
6.6 KiB
Python
142 lines
6.6 KiB
Python
"""Tests for unified_grid.py — merging multi-zone grids into single zone."""
|
|
|
|
import pytest
|
|
import sys, os
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
|
|
|
from unified_grid import (
|
|
_compute_dominant_row_height,
|
|
_classify_boxes,
|
|
build_unified_grid,
|
|
)
|
|
|
|
|
|
def _make_content_zone(rows_y, num_cols=4, bbox_w=1400):
|
|
"""Helper: create a content zone with rows at given y positions."""
|
|
rows = [{"index": i, "y_min_px": y, "y_max_px": y + 30, "y_min": y, "y_max": y + 30,
|
|
"is_header": False} for i, y in enumerate(rows_y)]
|
|
cols = [{"index": i, "x_min_px": i * (bbox_w // num_cols), "x_max_px": (i + 1) * (bbox_w // num_cols)}
|
|
for i in range(num_cols)]
|
|
cells = [{"row_index": r["index"], "col_index": c["index"], "col_type": f"column_{c['index']+1}",
|
|
"text": f"R{r['index']}C{c['index']}", "cell_id": f"R{r['index']}C{c['index']}",
|
|
"word_boxes": [], "confidence": 90, "is_bold": False, "ocr_engine": "test",
|
|
"bbox_px": {"x": 0, "y": 0, "w": 100, "h": 30}, "bbox_pct": {"x": 0, "y": 0, "w": 10, "h": 2}}
|
|
for r in rows for c in cols]
|
|
return {
|
|
"zone_index": 1, "zone_type": "content",
|
|
"bbox_px": {"x": 50, "y": rows_y[0] - 10, "w": bbox_w, "h": rows_y[-1] - rows_y[0] + 50},
|
|
"bbox_pct": {"x": 3, "y": 10, "w": 85, "h": 80},
|
|
"columns": cols, "rows": rows, "cells": cells,
|
|
"header_rows": [], "border": None, "word_count": len(cells),
|
|
}
|
|
|
|
|
|
def _make_box_zone(zone_index, bbox, cells_data, bg_hex="#2563eb", layout_type="flowing"):
|
|
"""Helper: create a box zone."""
|
|
rows = [{"index": i, "y_min_px": bbox["y"] + i * 30, "y_max_px": bbox["y"] + (i + 1) * 30,
|
|
"is_header": i == 0} for i in range(len(cells_data))]
|
|
cols = [{"index": 0, "x_min_px": bbox["x"], "x_max_px": bbox["x"] + bbox["w"]}]
|
|
cells = [{"row_index": i, "col_index": 0, "col_type": "column_1",
|
|
"text": text, "cell_id": f"Z{zone_index}_R{i}C0",
|
|
"word_boxes": [], "confidence": 90, "is_bold": False, "ocr_engine": "test",
|
|
"bbox_px": {"x": bbox["x"], "y": bbox["y"] + i * 30, "w": bbox["w"], "h": 30},
|
|
"bbox_pct": {"x": 50, "y": 50, "w": 30, "h": 10}}
|
|
for i, text in enumerate(cells_data)]
|
|
return {
|
|
"zone_index": zone_index, "zone_type": "box",
|
|
"bbox_px": bbox, "bbox_pct": {"x": 50, "y": 50, "w": 30, "h": 10},
|
|
"columns": cols, "rows": rows, "cells": cells,
|
|
"header_rows": [0], "border": None, "word_count": len(cells),
|
|
"box_bg_hex": bg_hex, "box_bg_color": "blue", "box_layout_type": layout_type,
|
|
}
|
|
|
|
|
|
class TestDominantRowHeight:
    """Tests for ``_compute_dominant_row_height`` on synthetic content zones.

    The expected values are compared with ``pytest.approx`` because the
    function yields a numeric row height (the single-row case is asserted
    against ``47.0``); exact ``==`` on a computed float makes the tests
    brittle against harmless rounding differences.
    """

    def test_regular_spacing(self):
        """Rows spaced uniformly 47 px apart should yield a height of 47."""
        zone = _make_content_zone([100, 147, 194, 241, 288])
        h = _compute_dominant_row_height(zone)
        assert h == pytest.approx(47)

    def test_filters_large_gaps(self):
        """A single large gap between rows must not distort the result."""
        zone = _make_content_zone([100, 147, 194, 600, 647, 694])
        # Spacings are 47, 47, 406, 47, 47. The 406 px gap (e.g. a box
        # interrupting the rows) is expected to be ignored; the original
        # author's note describes this as "filter >100, then median".
        h = _compute_dominant_row_height(zone)
        assert h == pytest.approx(47)

    def test_single_row(self):
        """A zone with one row has no spacings; 47 is expected as fallback."""
        zone = _make_content_zone([100])
        h = _compute_dominant_row_height(zone)
        assert h == pytest.approx(47.0)
|
|
|
|
|
|
class TestClassifyBoxes:
    """Tests for ``_classify_boxes`` against a 1400 px wide content area."""

    def test_full_width(self):
        """A 1300 px wide box is expected to be classified as full_width."""
        wide_box = _make_box_zone(
            2, {"x": 50, "y": 500, "w": 1300, "h": 200}, ["Header", "Text"]
        )
        classified = _classify_boxes([wide_box], content_width=1400)
        assert classified[0]["classification"] == "full_width"

    def test_partial_width_right(self):
        """A 500 px wide box starting at x=800 → partial_width, side=right."""
        right_box = _make_box_zone(
            2, {"x": 800, "y": 500, "w": 500, "h": 200}, ["Header", "Text"]
        )
        classified = _classify_boxes([right_box], content_width=1400)
        first = classified[0]
        assert first["classification"] == "partial_width"
        assert first["side"] == "right"

    def test_partial_width_left(self):
        """A 500 px wide box starting at x=50 → partial_width, side=left."""
        left_box = _make_box_zone(
            2, {"x": 50, "y": 500, "w": 500, "h": 200}, ["Header", "Text"]
        )
        classified = _classify_boxes([left_box], content_width=1400)
        first = classified[0]
        assert first["classification"] == "partial_width"
        assert first["side"] == "left"

    def test_text_line_count(self):
        """Embedded newlines inside a cell count as additional text lines."""
        cell_texts = ["Header", "Line1\nLine2\nLine3"]
        multi_line_box = _make_box_zone(
            2, {"x": 50, "y": 500, "w": 500, "h": 200}, cell_texts
        )
        classified = _classify_boxes([multi_line_box], content_width=1400)
        # One line for "Header" plus three for the newline-separated cell.
        assert classified[0]["total_lines"] == 4
|
|
|
|
|
|
class TestBuildUnifiedGrid:
    """Tests for ``build_unified_grid`` with content and box zones.

    Every call passes 1600 and 2200 as the second and third arguments and an
    empty dict as the fourth.
    """

    def test_content_only(self):
        """A lone content zone comes back as exactly one ``unified`` zone."""
        content_zone = _make_content_zone([100, 147, 194, 241])
        unified = build_unified_grid([content_zone], 1600, 2200, {})
        assert unified["is_unified"] is True
        assert len(unified["zones"]) == 1
        assert unified["zones"][0]["zone_type"] == "unified"
        assert unified["summary"]["total_rows"] == 4

    def test_full_width_box_integration(self):
        """Rows of a wide box are added to the content rows in the result."""
        content_zone = _make_content_zone([100, 147, 194, 600, 647])
        box_zone = _make_box_zone(
            2,
            {"x": 50, "y": 300, "w": 1300, "h": 200},
            ["Box Header", "Box Row 1", "Box Row 2"],
        )
        unified = build_unified_grid([content_zone, box_zone], 1600, 2200, {})
        assert unified["is_unified"] is True
        # Five content rows plus three box rows.
        assert unified["summary"]["total_rows"] == 8

    def test_box_cells_tagged(self):
        """Cells originating from a box carry source_zone_type and box_region."""
        content_zone = _make_content_zone([100, 147, 600, 647])
        box_zone = _make_box_zone(
            2, {"x": 50, "y": 300, "w": 1300, "h": 200}, ["Box Text"]
        )
        unified = build_unified_grid([content_zone, box_zone], 1600, 2200, {})
        from_box = []
        for cell in unified["zones"][0]["cells"]:
            if cell.get("source_zone_type") == "box":
                from_box.append(cell)
        assert from_box
        assert from_box[0]["box_region"]["bg_hex"] == "#2563eb"

    def test_no_content_zone(self):
        """With only a box zone the result must still expose a ``zones`` key."""
        box_zone = _make_box_zone(
            2, {"x": 50, "y": 300, "w": 500, "h": 200}, ["Text"]
        )
        unified = build_unified_grid([box_zone], 1600, 2200, {})
        assert "zones" in unified
|