Add tests for unified_grid and cv_box_layout
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 50s
CI / test-go-edu-search (push) Successful in 45s
CI / test-python-klausur (push) Failing after 2m30s
CI / test-python-agent-core (push) Successful in 31s
CI / test-nodejs-website (push) Successful in 34s

test_unified_grid.py (11 tests):
- Dominant row height calculation (regular, gaps filtered, single row)
- Box classification (full-width, partial left/right, text line count)
- Unified grid building (content-only, box integration, cell tagging)

test_box_layout.py (12 tests):
- Layout classification (header_only, flowing, bullet_list)
- Line grouping by y-proximity
- Flowing layout indent grouping (bullet + continuations → \n)
- Row/column field completeness for GridTable compatibility

Total: 66 tests passing (43 smart_spell + 12 box_layout + 11 unified)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-15 18:18:52 +02:00
parent 1d22f649ae
commit 3d3c2b30db
2 changed files with 265 additions and 0 deletions

View File

@@ -0,0 +1,124 @@
"""Tests for cv_box_layout.py — box layout classification and grid building."""
import pytest
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from cv_box_layout import classify_box_layout, build_box_zone_grid, _group_into_lines
def _make_words(lines_data):
"""Create word dicts from [(y, x, text), ...] tuples."""
words = []
for y, x, text in lines_data:
words.append({"top": y, "left": x, "width": len(text) * 10, "height": 25, "text": text})
return words
class TestClassifyBoxLayout:
    """Checks the layout label that classify_box_layout returns."""

    def test_header_only(self):
        """A box containing one short line is labelled header_only."""
        single_line = _make_words([(100, 50, "Unit 3")])
        layout = classify_box_layout(single_line, 500, 50)
        assert layout == "header_only"

    def test_empty(self):
        """An empty word list is labelled header_only as well."""
        layout = classify_box_layout([], 500, 200)
        assert layout == "header_only"

    def test_flowing(self):
        """Several lines with no bullet markers are labelled flowing."""
        line_specs = [
            (100, 50, "German leihen title"),
            (130, 50, "etwas ausleihen von jm"),
            (160, 70, "borrow sth from sb"),
            (190, 70, "Can I borrow your CD"),
            (220, 50, "etwas verleihen an jn"),
            (250, 70, "OK I can lend you my"),
        ]
        layout = classify_box_layout(_make_words(line_specs), 500, 200)
        assert layout == "flowing"

    def test_bullet_list(self):
        """A title followed by lines that start with a bullet is a bullet_list."""
        line_specs = [
            (100, 50, "Title of the box"),
            (130, 50, "• First item text here"),
            (160, 50, "• Second item text here"),
            (190, 50, "• Third item text here"),
            (220, 50, "• Fourth item text here"),
            (250, 50, "• Fifth item text here"),
        ]
        layout = classify_box_layout(_make_words(line_specs), 500, 150)
        assert layout == "bullet_list"
class TestGroupIntoLines:
    """Checks how _group_into_lines buckets words by vertical position."""

    def test_single_line(self):
        """Two words sharing the same top value end up in one line."""
        same_row = _make_words([(100, 50, "hello"), (100, 120, "world")])
        grouped = _group_into_lines(same_row)
        assert len(grouped) == 1
        assert len(grouped[0]) == 2

    def test_two_lines(self):
        """Words 50 px apart vertically form two separate lines."""
        stacked = _make_words([(100, 50, "line1"), (150, 50, "line2")])
        grouped = _group_into_lines(stacked)
        assert len(grouped) == 2

    def test_y_proximity(self):
        """Words only 3 px apart vertically are treated as one line."""
        nearly_aligned = _make_words([(100, 50, "a"), (103, 120, "b")])
        grouped = _group_into_lines(nearly_aligned)
        assert len(grouped) == 1
class TestBuildBoxZoneGrid:
    """Checks the zone dict that build_box_zone_grid produces per layout type."""

    def test_flowing_groups_by_indent(self):
        """Indented continuation lines are merged into the preceding line."""
        line_specs = [
            (100, 50, "Header Title"),
            (130, 50, "Bullet start text"),
            (160, 80, "continuation line 1"),
            (190, 80, "continuation line 2"),
        ]
        grid = build_box_zone_grid(
            _make_words(line_specs), 40, 90, 500, 120, 0, 1600, 2200,
            layout_type="flowing",
        )
        # One header row plus one grouped bullet row.
        assert len(grid["rows"]) == 2
        assert len(grid["cells"]) == 2
        # The grouped cell keeps its lines separated by a newline.
        grouped_cell = grid["cells"][1]
        assert "\n" in grouped_cell["text"]

    def test_header_only_single_cell(self):
        """A header_only box yields exactly one cell and reports its layout."""
        title_words = _make_words([(100, 50, "Just a title")])
        grid = build_box_zone_grid(
            title_words, 40, 90, 500, 50, 0, 1600, 2200,
            layout_type="header_only",
        )
        assert len(grid["cells"]) == 1
        assert grid["box_layout_type"] == "header_only"

    def test_columnar_delegates_to_zone_grid(self):
        """A columnar box reports its layout and at least one column."""
        line_specs = [
            (100, 50, "Col A header"),
            (100, 300, "Col B header"),
            (130, 50, "A data"),
            (130, 300, "B data"),
        ]
        grid = build_box_zone_grid(
            _make_words(line_specs), 40, 90, 500, 80, 0, 1600, 2200,
            layout_type="columnar",
        )
        assert grid["box_layout_type"] == "columnar"
        # At least one column must have been detected.
        detected_columns = grid.get("columns", [])
        assert len(detected_columns) >= 1

    def test_row_fields_for_gridtable(self):
        """Every row carries y_min_px, y_max_px and is_header for GridTable."""
        two_lines = _make_words([(100, 50, "Title"), (130, 50, "Body")])
        grid = build_box_zone_grid(
            two_lines, 40, 90, 500, 80, 0, 1600, 2200,
            layout_type="flowing",
        )
        for row in grid["rows"]:
            for field in ("y_min_px", "y_max_px", "is_header"):
                assert field in row

    def test_column_fields_for_gridtable(self):
        """Every column carries x_min_px and x_max_px for GridTable widths."""
        one_word = _make_words([(100, 50, "Text")])
        grid = build_box_zone_grid(
            one_word, 40, 90, 500, 50, 0, 1600, 2200,
            layout_type="flowing",
        )
        for column in grid["columns"]:
            for field in ("x_min_px", "x_max_px"):
                assert field in column

View File

@@ -0,0 +1,141 @@
"""Tests for unified_grid.py — merging multi-zone grids into single zone."""
import pytest
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from unified_grid import (
_compute_dominant_row_height,
_classify_boxes,
build_unified_grid,
)
def _make_content_zone(rows_y, num_cols=4, bbox_w=1400):
"""Helper: create a content zone with rows at given y positions."""
rows = [{"index": i, "y_min_px": y, "y_max_px": y + 30, "y_min": y, "y_max": y + 30,
"is_header": False} for i, y in enumerate(rows_y)]
cols = [{"index": i, "x_min_px": i * (bbox_w // num_cols), "x_max_px": (i + 1) * (bbox_w // num_cols)}
for i in range(num_cols)]
cells = [{"row_index": r["index"], "col_index": c["index"], "col_type": f"column_{c['index']+1}",
"text": f"R{r['index']}C{c['index']}", "cell_id": f"R{r['index']}C{c['index']}",
"word_boxes": [], "confidence": 90, "is_bold": False, "ocr_engine": "test",
"bbox_px": {"x": 0, "y": 0, "w": 100, "h": 30}, "bbox_pct": {"x": 0, "y": 0, "w": 10, "h": 2}}
for r in rows for c in cols]
return {
"zone_index": 1, "zone_type": "content",
"bbox_px": {"x": 50, "y": rows_y[0] - 10, "w": bbox_w, "h": rows_y[-1] - rows_y[0] + 50},
"bbox_pct": {"x": 3, "y": 10, "w": 85, "h": 80},
"columns": cols, "rows": rows, "cells": cells,
"header_rows": [], "border": None, "word_count": len(cells),
}
def _make_box_zone(zone_index, bbox, cells_data, bg_hex="#2563eb", layout_type="flowing"):
"""Helper: create a box zone."""
rows = [{"index": i, "y_min_px": bbox["y"] + i * 30, "y_max_px": bbox["y"] + (i + 1) * 30,
"is_header": i == 0} for i in range(len(cells_data))]
cols = [{"index": 0, "x_min_px": bbox["x"], "x_max_px": bbox["x"] + bbox["w"]}]
cells = [{"row_index": i, "col_index": 0, "col_type": "column_1",
"text": text, "cell_id": f"Z{zone_index}_R{i}C0",
"word_boxes": [], "confidence": 90, "is_bold": False, "ocr_engine": "test",
"bbox_px": {"x": bbox["x"], "y": bbox["y"] + i * 30, "w": bbox["w"], "h": 30},
"bbox_pct": {"x": 50, "y": 50, "w": 30, "h": 10}}
for i, text in enumerate(cells_data)]
return {
"zone_index": zone_index, "zone_type": "box",
"bbox_px": bbox, "bbox_pct": {"x": 50, "y": 50, "w": 30, "h": 10},
"columns": cols, "rows": rows, "cells": cells,
"header_rows": [0], "border": None, "word_count": len(cells),
"box_bg_hex": bg_hex, "box_bg_color": "blue", "box_layout_type": layout_type,
}
class TestDominantRowHeight:
    """Checks the row height that _compute_dominant_row_height reports."""

    def test_regular_spacing(self):
        """Rows spaced a uniform 47 px apart give a dominant height of 47."""
        evenly_spaced = _make_content_zone([100, 147, 194, 241, 288])
        assert _compute_dominant_row_height(evenly_spaced) == 47

    def test_filters_large_gaps(self):
        """One large gap (a box interruption) does not change the result."""
        # Spacings are 47, 47, 406, 47, 47; the 406 px outlier is expected
        # to be filtered (> 100), leaving a median of 47.
        interrupted = _make_content_zone([100, 147, 194, 600, 647, 694])
        assert _compute_dominant_row_height(interrupted) == 47

    def test_single_row(self):
        """With only one row there is no spacing, so the default 47 is used."""
        lone_row = _make_content_zone([100])
        assert _compute_dominant_row_height(lone_row) == 47.0
class TestClassifyBoxes:
    """Checks the classification, side and line count from _classify_boxes."""

    def test_full_width(self):
        """A 1300 px box in 1400 px of content is classified full_width."""
        wide_bbox = {"x": 50, "y": 500, "w": 1300, "h": 200}
        zones = [_make_box_zone(2, wide_bbox, ["Header", "Text"])]
        classified = _classify_boxes(zones, content_width=1400)
        assert classified[0]["classification"] == "full_width"

    def test_partial_width_right(self):
        """A narrow box starting at x=800 is partial_width on the right."""
        right_bbox = {"x": 800, "y": 500, "w": 500, "h": 200}
        zones = [_make_box_zone(2, right_bbox, ["Header", "Text"])]
        classified = _classify_boxes(zones, content_width=1400)
        first = classified[0]
        assert first["classification"] == "partial_width"
        assert classified[0]["side"] == "right"

    def test_partial_width_left(self):
        """A narrow box starting at x=50 is partial_width on the left."""
        left_bbox = {"x": 50, "y": 500, "w": 500, "h": 200}
        zones = [_make_box_zone(2, left_bbox, ["Header", "Text"])]
        classified = _classify_boxes(zones, content_width=1400)
        first = classified[0]
        assert first["classification"] == "partial_width"
        assert classified[0]["side"] == "left"

    def test_text_line_count(self):
        """total_lines counts every line, including those split by \\n."""
        cell_texts = ["Header", "Line1\nLine2\nLine3"]
        zones = [_make_box_zone(2, {"x": 50, "y": 500, "w": 500, "h": 200}, cell_texts)]
        classified = _classify_boxes(zones, content_width=1400)
        # "Header" contributes 1 line, the second cell contributes 3.
        assert classified[0]["total_lines"] == 4
class TestBuildUnifiedGrid:
    """Checks the merged result that build_unified_grid returns."""

    def test_content_only(self):
        """A lone content zone becomes a single zone of type unified."""
        content_zone = _make_content_zone([100, 147, 194, 241])
        merged = build_unified_grid([content_zone], 1600, 2200, {})
        assert merged["is_unified"] is True
        assert len(merged["zones"]) == 1
        only_zone = merged["zones"][0]
        assert only_zone["zone_type"] == "unified"
        assert merged["summary"]["total_rows"] == 4

    def test_full_width_box_integration(self):
        """Rows of a full-width box are added to the content rows."""
        content_zone = _make_content_zone([100, 147, 194, 600, 647])
        box_zone = _make_box_zone(
            2,
            {"x": 50, "y": 300, "w": 1300, "h": 200},
            ["Box Header", "Box Row 1", "Box Row 2"],
        )
        merged = build_unified_grid([content_zone, box_zone], 1600, 2200, {})
        assert merged["is_unified"] is True
        # 5 content rows plus 3 box rows.
        assert merged["summary"]["total_rows"] == 8

    def test_box_cells_tagged(self):
        """Cells that came from a box carry source_zone_type and box_region."""
        content_zone = _make_content_zone([100, 147, 600, 647])
        box_zone = _make_box_zone(2, {"x": 50, "y": 300, "w": 1300, "h": 200}, ["Box Text"])
        merged = build_unified_grid([content_zone, box_zone], 1600, 2200, {})
        from_box = []
        for cell in merged["zones"][0]["cells"]:
            if cell.get("source_zone_type") == "box":
                from_box.append(cell)
        assert len(from_box) > 0
        assert from_box[0]["box_region"]["bg_hex"] == "#2563eb"

    def test_no_content_zone(self):
        """Without a content zone the result still exposes a zones key."""
        box_zone = _make_box_zone(2, {"x": 50, "y": 300, "w": 500, "h": 200}, ["Text"])
        merged = build_unified_grid([box_zone], 1600, 2200, {})
        assert "zones" in merged