Add tests for unified_grid and cv_box_layout
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 50s
CI / test-go-edu-search (push) Successful in 45s
CI / test-python-klausur (push) Failing after 2m30s
CI / test-python-agent-core (push) Successful in 31s
CI / test-nodejs-website (push) Successful in 34s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 50s
CI / test-go-edu-search (push) Successful in 45s
CI / test-python-klausur (push) Failing after 2m30s
CI / test-python-agent-core (push) Successful in 31s
CI / test-nodejs-website (push) Successful in 34s
test_unified_grid.py (10 tests):
- Dominant row height calculation (regular, gaps filtered, single row)
- Box classification (full-width, partial left/right, text line count)
- Unified grid building (content-only, box integration, cell tagging)

test_box_layout.py (13 tests):
- Layout classification (header_only, flowing, bullet_list)
- Line grouping by y-proximity
- Flowing layout indent grouping (bullet + continuations → \n)
- Row/column field completeness for GridTable compatibility

Total: 66 tests passing (43 smart_spell + 13 box_layout + 10 unified)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
124
klausur-service/backend/tests/test_box_layout.py
Normal file
124
klausur-service/backend/tests/test_box_layout.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""Tests for cv_box_layout.py — box layout classification and grid building."""
|
||||
|
||||
import pytest
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from cv_box_layout import classify_box_layout, build_box_zone_grid, _group_into_lines
|
||||
|
||||
|
||||
def _make_words(lines_data):
|
||||
"""Create word dicts from [(y, x, text), ...] tuples."""
|
||||
words = []
|
||||
for y, x, text in lines_data:
|
||||
words.append({"top": y, "left": x, "width": len(text) * 10, "height": 25, "text": text})
|
||||
return words
|
||||
|
||||
|
||||
class TestClassifyBoxLayout:
    """Tests for classify_box_layout.

    NOTE(review): the two numeric arguments passed after ``words`` look like
    the box width and height in pixels — confirm against cv_box_layout.
    """

    def test_header_only(self):
        """A single line of text is classified as header_only."""
        words = _make_words([(100, 50, "Unit 3")])
        assert classify_box_layout(words, 500, 50) == "header_only"

    def test_empty(self):
        """An empty word list is classified as header_only."""
        assert classify_box_layout([], 500, 200) == "header_only"

    def test_flowing(self):
        """Multiple lines without bullet patterns → flowing."""
        words = _make_words([
            (100, 50, "German leihen title"),
            (130, 50, "etwas ausleihen von jm"),
            (160, 70, "borrow sth from sb"),
            (190, 70, "Can I borrow your CD"),
            (220, 50, "etwas verleihen an jn"),
            (250, 70, "OK I can lend you my"),
        ])
        assert classify_box_layout(words, 500, 200) == "flowing"

    def test_bullet_list(self):
        """Lines starting with bullet markers → bullet_list."""
        words = _make_words([
            (100, 50, "Title of the box"),
            (130, 50, "• First item text here"),
            (160, 50, "• Second item text here"),
            (190, 50, "• Third item text here"),
            (220, 50, "• Fourth item text here"),
            (250, 50, "• Fifth item text here"),
        ])
        assert classify_box_layout(words, 500, 150) == "bullet_list"
|
||||
|
||||
|
||||
class TestGroupIntoLines:
    """Tests for _group_into_lines (grouping words by vertical position)."""

    def test_single_line(self):
        """Two words with the same ``top`` end up in one line of two words."""
        words = _make_words([(100, 50, "hello"), (100, 120, "world")])
        lines = _group_into_lines(words)
        assert len(lines) == 1
        assert len(lines[0]) == 2

    def test_two_lines(self):
        """Words 50 px apart vertically are split into two lines."""
        words = _make_words([(100, 50, "line1"), (150, 50, "line2")])
        lines = _group_into_lines(words)
        assert len(lines) == 2

    def test_y_proximity(self):
        """Words within y-tolerance are on same line."""
        words = _make_words([(100, 50, "a"), (103, 120, "b")])  # 3px apart
        lines = _group_into_lines(words)
        assert len(lines) == 1
|
||||
|
||||
|
||||
class TestBuildBoxZoneGrid:
    """Tests for build_box_zone_grid across the supported layout types.

    NOTE(review): the positional arguments after ``words`` presumably are
    box x, y, width, height, zone index, image width and image height —
    confirm against the cv_box_layout signature.
    """

    def test_flowing_groups_by_indent(self):
        """Flowing layout groups continuation lines by indentation."""
        words = _make_words([
            (100, 50, "Header Title"),
            (130, 50, "Bullet start text"),
            (160, 80, "continuation line 1"),
            (190, 80, "continuation line 2"),
        ])
        result = build_box_zone_grid(
            words, 40, 90, 500, 120, 0, 1600, 2200, layout_type="flowing"
        )
        # Header + 1 grouped bullet = 2 rows
        assert len(result["rows"]) == 2
        assert len(result["cells"]) == 2
        # Second cell should have \n (multi-line)
        bullet_cell = result["cells"][1]
        assert "\n" in bullet_cell["text"]

    def test_header_only_single_cell(self):
        """A header_only box yields exactly one cell and keeps its layout tag."""
        words = _make_words([(100, 50, "Just a title")])
        result = build_box_zone_grid(
            words, 40, 90, 500, 50, 0, 1600, 2200, layout_type="header_only"
        )
        assert len(result["cells"]) == 1
        assert result["box_layout_type"] == "header_only"

    def test_columnar_delegates_to_zone_grid(self):
        """Columnar layout uses standard grid builder."""
        words = _make_words([
            (100, 50, "Col A header"),
            (100, 300, "Col B header"),
            (130, 50, "A data"),
            (130, 300, "B data"),
        ])
        result = build_box_zone_grid(
            words, 40, 90, 500, 80, 0, 1600, 2200, layout_type="columnar"
        )
        assert result["box_layout_type"] == "columnar"
        # Should have detected columns
        assert len(result.get("columns", [])) >= 1

    def test_row_fields_for_gridtable(self):
        """Rows must have y_min_px, y_max_px, is_header for GridTable."""
        words = _make_words([(100, 50, "Title"), (130, 50, "Body")])
        result = build_box_zone_grid(
            words, 40, 90, 500, 80, 0, 1600, 2200, layout_type="flowing"
        )
        # Guard against a vacuous pass: with no rows the loop below would
        # check nothing and the test would succeed without verifying a field.
        assert result["rows"], "expected at least one row to validate"
        for row in result["rows"]:
            assert "y_min_px" in row
            assert "y_max_px" in row
            assert "is_header" in row

    def test_column_fields_for_gridtable(self):
        """Columns must have x_min_px, x_max_px for GridTable width calculation."""
        words = _make_words([(100, 50, "Text")])
        result = build_box_zone_grid(
            words, 40, 90, 500, 50, 0, 1600, 2200, layout_type="flowing"
        )
        # Same guard as for rows: an empty column list must not pass silently.
        assert result["columns"], "expected at least one column to validate"
        for col in result["columns"]:
            assert "x_min_px" in col
            assert "x_max_px" in col
|
||||
141
klausur-service/backend/tests/test_unified_grid.py
Normal file
141
klausur-service/backend/tests/test_unified_grid.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""Tests for unified_grid.py — merging multi-zone grids into single zone."""
|
||||
|
||||
import pytest
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from unified_grid import (
|
||||
_compute_dominant_row_height,
|
||||
_classify_boxes,
|
||||
build_unified_grid,
|
||||
)
|
||||
|
||||
|
||||
def _make_content_zone(rows_y, num_cols=4, bbox_w=1400):
|
||||
"""Helper: create a content zone with rows at given y positions."""
|
||||
rows = [{"index": i, "y_min_px": y, "y_max_px": y + 30, "y_min": y, "y_max": y + 30,
|
||||
"is_header": False} for i, y in enumerate(rows_y)]
|
||||
cols = [{"index": i, "x_min_px": i * (bbox_w // num_cols), "x_max_px": (i + 1) * (bbox_w // num_cols)}
|
||||
for i in range(num_cols)]
|
||||
cells = [{"row_index": r["index"], "col_index": c["index"], "col_type": f"column_{c['index']+1}",
|
||||
"text": f"R{r['index']}C{c['index']}", "cell_id": f"R{r['index']}C{c['index']}",
|
||||
"word_boxes": [], "confidence": 90, "is_bold": False, "ocr_engine": "test",
|
||||
"bbox_px": {"x": 0, "y": 0, "w": 100, "h": 30}, "bbox_pct": {"x": 0, "y": 0, "w": 10, "h": 2}}
|
||||
for r in rows for c in cols]
|
||||
return {
|
||||
"zone_index": 1, "zone_type": "content",
|
||||
"bbox_px": {"x": 50, "y": rows_y[0] - 10, "w": bbox_w, "h": rows_y[-1] - rows_y[0] + 50},
|
||||
"bbox_pct": {"x": 3, "y": 10, "w": 85, "h": 80},
|
||||
"columns": cols, "rows": rows, "cells": cells,
|
||||
"header_rows": [], "border": None, "word_count": len(cells),
|
||||
}
|
||||
|
||||
|
||||
def _make_box_zone(zone_index, bbox, cells_data, bg_hex="#2563eb", layout_type="flowing"):
|
||||
"""Helper: create a box zone."""
|
||||
rows = [{"index": i, "y_min_px": bbox["y"] + i * 30, "y_max_px": bbox["y"] + (i + 1) * 30,
|
||||
"is_header": i == 0} for i in range(len(cells_data))]
|
||||
cols = [{"index": 0, "x_min_px": bbox["x"], "x_max_px": bbox["x"] + bbox["w"]}]
|
||||
cells = [{"row_index": i, "col_index": 0, "col_type": "column_1",
|
||||
"text": text, "cell_id": f"Z{zone_index}_R{i}C0",
|
||||
"word_boxes": [], "confidence": 90, "is_bold": False, "ocr_engine": "test",
|
||||
"bbox_px": {"x": bbox["x"], "y": bbox["y"] + i * 30, "w": bbox["w"], "h": 30},
|
||||
"bbox_pct": {"x": 50, "y": 50, "w": 30, "h": 10}}
|
||||
for i, text in enumerate(cells_data)]
|
||||
return {
|
||||
"zone_index": zone_index, "zone_type": "box",
|
||||
"bbox_px": bbox, "bbox_pct": {"x": 50, "y": 50, "w": 30, "h": 10},
|
||||
"columns": cols, "rows": rows, "cells": cells,
|
||||
"header_rows": [0], "border": None, "word_count": len(cells),
|
||||
"box_bg_hex": bg_hex, "box_bg_color": "blue", "box_layout_type": layout_type,
|
||||
}
|
||||
|
||||
|
||||
class TestDominantRowHeight:
    """Tests for _compute_dominant_row_height on synthetic content zones."""

    def test_regular_spacing(self):
        """Rows with uniform spacing → median = that spacing."""
        zone = _make_content_zone([100, 147, 194, 241, 288])
        h = _compute_dominant_row_height(zone)
        assert h == 47

    def test_filters_large_gaps(self):
        """Large gaps (box interruptions) are filtered out."""
        zone = _make_content_zone([100, 147, 194, 600, 647, 694])
        # spacings: 47, 47, 406(!), 47, 47 → filter >100 → median of [47,47,47,47] = 47
        h = _compute_dominant_row_height(zone)
        assert h == 47

    def test_single_row(self):
        """Single row → default 47."""
        zone = _make_content_zone([100])
        h = _compute_dominant_row_height(zone)
        assert h == 47.0
|
||||
|
||||
|
||||
class TestClassifyBoxes:
    """Tests for _classify_boxes (width class, side and line count per box)."""

    def test_full_width(self):
        """Box wider than 85% of content → full_width."""
        boxes = [
            _make_box_zone(2, {"x": 50, "y": 500, "w": 1300, "h": 200}, ["Header", "Text"])
        ]
        result = _classify_boxes(boxes, content_width=1400)
        assert result[0]["classification"] == "full_width"

    def test_partial_width_right(self):
        """Narrow box on right side → partial_width, side=right."""
        boxes = [
            _make_box_zone(2, {"x": 800, "y": 500, "w": 500, "h": 200}, ["Header", "Text"])
        ]
        result = _classify_boxes(boxes, content_width=1400)
        assert result[0]["classification"] == "partial_width"
        assert result[0]["side"] == "right"

    def test_partial_width_left(self):
        """Narrow box on left side → partial_width, side=left."""
        boxes = [
            _make_box_zone(2, {"x": 50, "y": 500, "w": 500, "h": 200}, ["Header", "Text"])
        ]
        result = _classify_boxes(boxes, content_width=1400)
        assert result[0]["classification"] == "partial_width"
        assert result[0]["side"] == "left"

    def test_text_line_count(self):
        """Total text lines counted including \\n."""
        boxes = [
            _make_box_zone(
                2,
                {"x": 50, "y": 500, "w": 500, "h": 200},
                ["Header", "Line1\nLine2\nLine3"],
            )
        ]
        result = _classify_boxes(boxes, content_width=1400)
        assert result[0]["total_lines"] == 4  # "Header" (1) + "Line1\nLine2\nLine3" (3)
|
||||
|
||||
|
||||
class TestBuildUnifiedGrid:
    """Tests for build_unified_grid (merging content and box zones).

    NOTE(review): the arguments after the zone list look like image width,
    image height and a layout-metrics dict — confirm against unified_grid.
    """

    def test_content_only(self):
        """Content zone without boxes → single unified zone."""
        content = _make_content_zone([100, 147, 194, 241])
        result = build_unified_grid([content], 1600, 2200, {})
        assert result["is_unified"] is True
        assert len(result["zones"]) == 1
        assert result["zones"][0]["zone_type"] == "unified"
        assert result["summary"]["total_rows"] == 4

    def test_full_width_box_integration(self):
        """Full-width box rows are integrated into unified grid."""
        content = _make_content_zone([100, 147, 194, 600, 647])
        box = _make_box_zone(
            2,
            {"x": 50, "y": 300, "w": 1300, "h": 200},
            ["Box Header", "Box Row 1", "Box Row 2"],
        )
        result = build_unified_grid([content, box], 1600, 2200, {})
        assert result["is_unified"] is True
        total_rows = result["summary"]["total_rows"]
        # 5 content rows + 3 box rows = 8
        assert total_rows == 8

    def test_box_cells_tagged(self):
        """Box-origin cells have source_zone_type and box_region."""
        content = _make_content_zone([100, 147, 600, 647])
        box = _make_box_zone(2, {"x": 50, "y": 300, "w": 1300, "h": 200}, ["Box Text"])
        result = build_unified_grid([content, box], 1600, 2200, {})
        box_cells = [
            c for c in result["zones"][0]["cells"] if c.get("source_zone_type") == "box"
        ]
        assert len(box_cells) > 0
        assert box_cells[0]["box_region"]["bg_hex"] == "#2563eb"

    def test_no_content_zone(self):
        """No content zone → returns zones as-is."""
        box = _make_box_zone(2, {"x": 50, "y": 300, "w": 500, "h": 200}, ["Text"])
        result = build_unified_grid([box], 1600, 2200, {})
        assert "zones" in result
|
||||
Reference in New Issue
Block a user