From 3d3c2b30db7ca2a52c740c409315622f721f97b2 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Wed, 15 Apr 2026 18:18:52 +0200
Subject: [PATCH] Add tests for unified_grid and cv_box_layout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

test_unified_grid.py (11 tests):
- Dominant row height calculation (regular, gaps filtered, single row)
- Box classification (full-width, partial left/right, text line count)
- Unified grid building (content-only, box integration, cell tagging)

test_box_layout.py (12 tests):
- Layout classification (header_only, flowing, bullet_list)
- Line grouping by y-proximity
- Flowing layout indent grouping (bullet + continuations → \n)
- Row/column field completeness for GridTable compatibility

Total: 66 tests passing (43 smart_spell + 12 box_layout + 11 unified)

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../backend/tests/test_box_layout.py          | 124 +++++++++++++++
 .../backend/tests/test_unified_grid.py        | 141 ++++++++++++++++++
 2 files changed, 265 insertions(+)
 create mode 100644 klausur-service/backend/tests/test_box_layout.py
 create mode 100644 klausur-service/backend/tests/test_unified_grid.py

diff --git a/klausur-service/backend/tests/test_box_layout.py b/klausur-service/backend/tests/test_box_layout.py
new file mode 100644
index 0000000..a1cf428
--- /dev/null
+++ b/klausur-service/backend/tests/test_box_layout.py
@@ -0,0 +1,124 @@
+"""Tests for cv_box_layout.py — box layout classification and grid building."""
+
+import pytest
+import sys, os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from cv_box_layout import classify_box_layout, build_box_zone_grid, _group_into_lines
+
+
+def _make_words(lines_data):
+    """Create word dicts from [(y, x, text), ...] tuples."""
+    words = []
+    for y, x, text in lines_data:
+        words.append({"top": y, "left": x, "width": len(text) * 10, "height": 25, "text": text})
+    return words
+
+
+class TestClassifyBoxLayout:
+
+    def test_header_only(self):
+        words = _make_words([(100, 50, "Unit 3")])
+        assert classify_box_layout(words, 500, 50) == "header_only"
+
+    def test_empty(self):
+        assert classify_box_layout([], 500, 200) == "header_only"
+
+    def test_flowing(self):
+        """Multiple lines without bullet patterns → flowing."""
+        words = _make_words([
+            (100, 50, "German leihen title"),
+            (130, 50, "etwas ausleihen von jm"),
+            (160, 70, "borrow sth from sb"),
+            (190, 70, "Can I borrow your CD"),
+            (220, 50, "etwas verleihen an jn"),
+            (250, 70, "OK I can lend you my"),
+        ])
+        assert classify_box_layout(words, 500, 200) == "flowing"
+
+    def test_bullet_list(self):
+        """Lines starting with bullet markers → bullet_list."""
+        words = _make_words([
+            (100, 50, "Title of the box"),
+            (130, 50, "• First item text here"),
+            (160, 50, "• Second item text here"),
+            (190, 50, "• Third item text here"),
+            (220, 50, "• Fourth item text here"),
+            (250, 50, "• Fifth item text here"),
+        ])
+        assert classify_box_layout(words, 500, 150) == "bullet_list"
+
+
+class TestGroupIntoLines:
+
+    def test_single_line(self):
+        words = _make_words([(100, 50, "hello"), (100, 120, "world")])
+        lines = _group_into_lines(words)
+        assert len(lines) == 1
+        assert len(lines[0]) == 2
+
+    def test_two_lines(self):
+        words = _make_words([(100, 50, "line1"), (150, 50, "line2")])
+        lines = _group_into_lines(words)
+        assert len(lines) == 2
+
+    def test_y_proximity(self):
+        """Words within y-tolerance are on same line."""
+        words = _make_words([(100, 50, "a"), (103, 120, "b")])  # 3px apart
+        lines = _group_into_lines(words)
+        assert len(lines) == 1
+
+
+class TestBuildBoxZoneGrid:
+
+    def test_flowing_groups_by_indent(self):
+        """Flowing layout groups continuation lines by indentation."""
+        words = _make_words([
+            (100, 50, "Header Title"),
+            (130, 50, "Bullet start text"),
+            (160, 80, "continuation line 1"),
+            (190, 80, "continuation line 2"),
+        ])
+        result = build_box_zone_grid(words, 40, 90, 500, 120, 0, 1600, 2200, layout_type="flowing")
+        # Header + 1 grouped bullet = 2 rows
+        assert len(result["rows"]) == 2
+        assert len(result["cells"]) == 2
+        # Second cell should have \n (multi-line)
+        bullet_cell = result["cells"][1]
+        assert "\n" in bullet_cell["text"]
+
+    def test_header_only_single_cell(self):
+        words = _make_words([(100, 50, "Just a title")])
+        result = build_box_zone_grid(words, 40, 90, 500, 50, 0, 1600, 2200, layout_type="header_only")
+        assert len(result["cells"]) == 1
+        assert result["box_layout_type"] == "header_only"
+
+    def test_columnar_delegates_to_zone_grid(self):
+        """Columnar layout uses standard grid builder."""
+        words = _make_words([
+            (100, 50, "Col A header"),
+            (100, 300, "Col B header"),
+            (130, 50, "A data"),
+            (130, 300, "B data"),
+        ])
+        result = build_box_zone_grid(words, 40, 90, 500, 80, 0, 1600, 2200, layout_type="columnar")
+        assert result["box_layout_type"] == "columnar"
+        # Should have detected columns
+        assert len(result.get("columns", [])) >= 1
+
+    def test_row_fields_for_gridtable(self):
+        """Rows must have y_min_px, y_max_px, is_header for GridTable."""
+        words = _make_words([(100, 50, "Title"), (130, 50, "Body")])
+        result = build_box_zone_grid(words, 40, 90, 500, 80, 0, 1600, 2200, layout_type="flowing")
+        for row in result["rows"]:
+            assert "y_min_px" in row
+            assert "y_max_px" in row
+            assert "is_header" in row
+
+    def test_column_fields_for_gridtable(self):
+        """Columns must have x_min_px, x_max_px for GridTable width calculation."""
+        words = _make_words([(100, 50, "Text")])
+        result = build_box_zone_grid(words, 40, 90, 500, 50, 0, 1600, 2200, layout_type="flowing")
+        for col in result["columns"]:
+            assert "x_min_px" in col
+            assert "x_max_px" in col
diff --git a/klausur-service/backend/tests/test_unified_grid.py b/klausur-service/backend/tests/test_unified_grid.py
new file mode 100644
index 0000000..9e9375d
--- /dev/null
+++ b/klausur-service/backend/tests/test_unified_grid.py
@@ -0,0 +1,141 @@
+"""Tests for unified_grid.py — merging multi-zone grids into single zone."""
+
+import pytest
+import sys, os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from unified_grid import (
+    _compute_dominant_row_height,
+    _classify_boxes,
+    build_unified_grid,
+)
+
+
+def _make_content_zone(rows_y, num_cols=4, bbox_w=1400):
+    """Helper: create a content zone with rows at given y positions."""
+    rows = [{"index": i, "y_min_px": y, "y_max_px": y + 30, "y_min": y, "y_max": y + 30,
+             "is_header": False} for i, y in enumerate(rows_y)]
+    cols = [{"index": i, "x_min_px": i * (bbox_w // num_cols), "x_max_px": (i + 1) * (bbox_w // num_cols)}
+            for i in range(num_cols)]
+    cells = [{"row_index": r["index"], "col_index": c["index"], "col_type": f"column_{c['index']+1}",
+              "text": f"R{r['index']}C{c['index']}", "cell_id": f"R{r['index']}C{c['index']}",
+              "word_boxes": [], "confidence": 90, "is_bold": False, "ocr_engine": "test",
+              "bbox_px": {"x": 0, "y": 0, "w": 100, "h": 30}, "bbox_pct": {"x": 0, "y": 0, "w": 10, "h": 2}}
+             for r in rows for c in cols]
+    return {
+        "zone_index": 1, "zone_type": "content",
+        "bbox_px": {"x": 50, "y": rows_y[0] - 10, "w": bbox_w, "h": rows_y[-1] - rows_y[0] + 50},
+        "bbox_pct": {"x": 3, "y": 10, "w": 85, "h": 80},
+        "columns": cols, "rows": rows, "cells": cells,
+        "header_rows": [], "border": None, "word_count": len(cells),
+    }
+
+
+def _make_box_zone(zone_index, bbox, cells_data, bg_hex="#2563eb", layout_type="flowing"):
+    """Helper: create a box zone."""
+    rows = [{"index": i, "y_min_px": bbox["y"] + i * 30, "y_max_px": bbox["y"] + (i + 1) * 30,
+             "is_header": i == 0} for i in range(len(cells_data))]
+    cols = [{"index": 0, "x_min_px": bbox["x"], "x_max_px": bbox["x"] + bbox["w"]}]
+    cells = [{"row_index": i, "col_index": 0, "col_type": "column_1",
+              "text": text, "cell_id": f"Z{zone_index}_R{i}C0",
+              "word_boxes": [], "confidence": 90, "is_bold": False, "ocr_engine": "test",
+              "bbox_px": {"x": bbox["x"], "y": bbox["y"] + i * 30, "w": bbox["w"], "h": 30},
+              "bbox_pct": {"x": 50, "y": 50, "w": 30, "h": 10}}
+             for i, text in enumerate(cells_data)]
+    return {
+        "zone_index": zone_index, "zone_type": "box",
+        "bbox_px": bbox, "bbox_pct": {"x": 50, "y": 50, "w": 30, "h": 10},
+        "columns": cols, "rows": rows, "cells": cells,
+        "header_rows": [0], "border": None, "word_count": len(cells),
+        "box_bg_hex": bg_hex, "box_bg_color": "blue", "box_layout_type": layout_type,
+    }
+
+
+class TestDominantRowHeight:
+
+    def test_regular_spacing(self):
+        """Rows with uniform spacing → median = that spacing."""
+        zone = _make_content_zone([100, 147, 194, 241, 288])
+        h = _compute_dominant_row_height(zone)
+        assert h == 47
+
+    def test_filters_large_gaps(self):
+        """Large gaps (box interruptions) are filtered out."""
+        zone = _make_content_zone([100, 147, 194, 600, 647, 694])
+        # spacings: 47, 47, 406(!), 47, 47 → filter >100 → median of [47,47,47,47] = 47
+        h = _compute_dominant_row_height(zone)
+        assert h == 47
+
+    def test_single_row(self):
+        """Single row → default 47."""
+        zone = _make_content_zone([100])
+        h = _compute_dominant_row_height(zone)
+        assert h == 47.0
+
+
+class TestClassifyBoxes:
+
+    def test_full_width(self):
+        """Box wider than 85% of content → full_width."""
+        boxes = [_make_box_zone(2, {"x": 50, "y": 500, "w": 1300, "h": 200}, ["Header", "Text"])]
+        result = _classify_boxes(boxes, content_width=1400)
+        assert result[0]["classification"] == "full_width"
+
+    def test_partial_width_right(self):
+        """Narrow box on right side → partial_width, side=right."""
+        boxes = [_make_box_zone(2, {"x": 800, "y": 500, "w": 500, "h": 200}, ["Header", "Text"])]
+        result = _classify_boxes(boxes, content_width=1400)
+        assert result[0]["classification"] == "partial_width"
+        assert result[0]["side"] == "right"
+
+    def test_partial_width_left(self):
+        """Narrow box on left side → partial_width, side=left."""
+        boxes = [_make_box_zone(2, {"x": 50, "y": 500, "w": 500, "h": 200}, ["Header", "Text"])]
+        result = _classify_boxes(boxes, content_width=1400)
+        assert result[0]["classification"] == "partial_width"
+        assert result[0]["side"] == "left"
+
+    def test_text_line_count(self):
+        """Total text lines counted including \\n."""
+        boxes = [_make_box_zone(2, {"x": 50, "y": 500, "w": 500, "h": 200},
+                                ["Header", "Line1\nLine2\nLine3"])]
+        result = _classify_boxes(boxes, content_width=1400)
+        assert result[0]["total_lines"] == 4  # "Header" (1) + "Line1\nLine2\nLine3" (3)
+
+
+class TestBuildUnifiedGrid:
+
+    def test_content_only(self):
+        """Content zone without boxes → single unified zone."""
+        content = _make_content_zone([100, 147, 194, 241])
+        result = build_unified_grid([content], 1600, 2200, {})
+        assert result["is_unified"] is True
+        assert len(result["zones"]) == 1
+        assert result["zones"][0]["zone_type"] == "unified"
+        assert result["summary"]["total_rows"] == 4
+
+    def test_full_width_box_integration(self):
+        """Full-width box rows are integrated into unified grid."""
+        content = _make_content_zone([100, 147, 194, 600, 647])
+        box = _make_box_zone(2, {"x": 50, "y": 300, "w": 1300, "h": 200},
+                             ["Box Header", "Box Row 1", "Box Row 2"])
+        result = build_unified_grid([content, box], 1600, 2200, {})
+        assert result["is_unified"] is True
+        total_rows = result["summary"]["total_rows"]
+        # 5 content rows + 3 box rows = 8
+        assert total_rows == 8
+
+    def test_box_cells_tagged(self):
+        """Box-origin cells have source_zone_type and box_region."""
+        content = _make_content_zone([100, 147, 600, 647])
+        box = _make_box_zone(2, {"x": 50, "y": 300, "w": 1300, "h": 200}, ["Box Text"])
+        result = build_unified_grid([content, box], 1600, 2200, {})
+        box_cells = [c for c in result["zones"][0]["cells"] if c.get("source_zone_type") == "box"]
+        assert len(box_cells) > 0
+        assert box_cells[0]["box_region"]["bg_hex"] == "#2563eb"
+
+    def test_no_content_zone(self):
+        """No content zone → returns zones as-is."""
+        box = _make_box_zone(2, {"x": 50, "y": 300, "w": 500, "h": 200}, ["Text"])
+        result = build_unified_grid([box], 1600, 2200, {})
+        assert "zones" in result