Add zone merging across images + heading detection by color/height
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 20s

Zone merging: content zones separated by box zones (images) are merged
into a single zone with image_overlays, so split tables reconnect.
Heading detection: after color annotation, rows where all words are
non-black and taller than 1.2x median are merged into spanning heading cells.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-19 12:22:11 +01:00
parent 2e6ab3a646
commit df30d4eae3
3 changed files with 586 additions and 0 deletions

View File

@@ -0,0 +1,360 @@
"""
Tests for grid_editor_api zone merging and heading detection.
Covers:
- _merge_content_zones_across_boxes: zone merging logic
- _detect_heading_rows_by_color: heading detection by color + height
"""
import sys
sys.path.insert(0, '/app')
import pytest
from cv_vocab_types import PageZone, DetectedBox
from grid_editor_api import (
_merge_content_zones_across_boxes,
_detect_heading_rows_by_color,
)
# ---------------------------------------------------------------------------
# _merge_content_zones_across_boxes
# ---------------------------------------------------------------------------
class TestMergeContentZonesAcrossBoxes:
    """Tests for _merge_content_zones_across_boxes on hand-built zone lists.

    Every test calls the function as ``(zones, 0, 500)``; the meaning of the
    two trailing integers is not visible here (presumably the content x-origin
    and width -- confirm against grid_editor_api).
    """

    @staticmethod
    def _content(index, y, height):
        """Build a content zone spanning x=0..500 at the given vertical band."""
        return PageZone(index=index, zone_type="content",
                        y=y, height=height, x=0, width=500)

    @staticmethod
    def _box(index, x, y, width, height, confidence):
        """Build a box zone whose DetectedBox repeats the zone's geometry."""
        detected = DetectedBox(x=x, y=y, width=width, height=height,
                               confidence=confidence)
        return PageZone(index=index, zone_type="box",
                        y=y, height=height, x=x, width=width, box=detected)

    def test_no_merge_when_less_than_3_zones(self):
        """Two zones (content, box) come back as two zones of the same types."""
        page = [
            self._content(0, y=0, height=100),
            self._box(1, x=50, y=100, width=400, height=50, confidence=0.9),
        ]

        out = _merge_content_zones_across_boxes(page, 0, 500)

        assert len(out) == 2
        first, second = out[0], out[1]
        assert first.zone_type == "content"
        assert second.zone_type == "box"

    def test_merge_content_box_content(self):
        """[content, box, content] collapses to one content zone with one overlay."""
        page = [
            self._content(0, y=0, height=100),
            self._box(1, x=50, y=100, width=400, height=50, confidence=0.9),
            self._content(2, y=150, height=200),
        ]

        out = _merge_content_zones_across_boxes(page, 0, 500)

        assert len(out) == 1
        combined = out[0]
        assert combined.zone_type == "content"
        assert combined.y == 0
        # The merged zone covers both content bands and the box: y 0 to 350.
        assert combined.height == 350
        assert len(combined.image_overlays) == 1
        overlay = combined.image_overlays[0]
        assert overlay["y"] == 100
        assert overlay["height"] == 50

    def test_box_at_start_not_merged(self):
        """A leading box stays separate; the following content/box/content merges."""
        page = [
            self._box(0, x=50, y=0, width=400, height=50, confidence=0.9),
            self._content(1, y=50, height=100),
            self._box(2, x=50, y=150, width=400, height=50, confidence=0.9),
            self._content(3, y=200, height=200),
        ]

        out = _merge_content_zones_across_boxes(page, 0, 500)

        # Expected layout: [leading box, merged content carrying one overlay].
        assert len(out) == 2
        leading, combined = out[0], out[1]
        assert leading.zone_type == "box"
        assert combined.zone_type == "content"
        assert len(combined.image_overlays) == 1

    def test_consecutive_boxes_not_merged(self):
        """[content, box, box, content] is returned with all four zones intact."""
        page = [
            self._content(0, y=0, height=100),
            self._box(1, x=50, y=100, width=400, height=50, confidence=0.9),
            self._box(2, x=60, y=150, width=380, height=30, confidence=0.8),
            self._content(3, y=180, height=200),
        ]

        out = _merge_content_zones_across_boxes(page, 0, 500)

        # Two adjacent boxes break the content/box/content pattern, so the
        # expectation is that nothing is merged.
        assert len(out) == 4

    def test_zone_reindexing(self):
        """The single merged zone carries index 0."""
        page = [
            self._content(0, y=0, height=100),
            self._box(1, x=50, y=100, width=400, height=50, confidence=0.9),
            self._content(2, y=150, height=200),
        ]

        out = _merge_content_zones_across_boxes(page, 0, 500)

        assert out[0].index == 0

    def test_no_boxes_passthrough(self):
        """A list made only of content zones keeps its length."""
        page = [
            self._content(0, y=0, height=100),
            self._content(1, y=100, height=100),
        ]

        out = _merge_content_zones_across_boxes(page, 0, 500)

        assert len(out) == 2

    def test_typical_vocab_page_pattern(self):
        """[box(VOCABULARY), content, box(image), content] -> [box, merged content]."""
        page = [
            self._box(0, x=50, y=10, width=400, height=40, confidence=0.95),
            self._content(1, y=60, height=50),
            self._box(2, x=50, y=120, width=400, height=85, confidence=0.8),
            self._content(3, y=210, height=500),
        ]

        out = _merge_content_zones_across_boxes(page, 0, 500)

        assert len(out) == 2
        header_box, combined = out[0], out[1]
        # The VOCABULARY header box is kept as its own zone.
        assert header_box.zone_type == "box"
        # The remaining three zones become one content zone from y=60 to y=710.
        assert combined.zone_type == "content"
        assert combined.y == 60
        assert combined.height == 710 - 60
        assert len(combined.image_overlays) == 1
        assert combined.image_overlays[0]["y"] == 120
        # Indices follow the position in the returned list.
        assert header_box.index == 0
        assert combined.index == 1
# ---------------------------------------------------------------------------
# _detect_heading_rows_by_color
# ---------------------------------------------------------------------------
class TestDetectHeadingRowsByColor:
    """Tests for _detect_heading_rows_by_color on hand-built zone dicts.

    Each test calls the function as ``(zones_data, 800, 1000)`` and checks the
    returned count. The two integers match the default bbox size used by
    ``_make_zone`` (presumably the image width and height -- confirm against
    grid_editor_api).
    """

    def _make_word_box(self, text, left, top, width, height, color="black"):
        """Return a word-box dict with the given geometry, color and conf=90."""
        geometry = {"left": left, "top": top, "width": width, "height": height}
        return {"text": text, **geometry, "color_name": color, "conf": 90}

    def _make_zone(self, cells, rows, columns, zone_index=0,
                   bbox_x=0, bbox_y=0, bbox_w=800, bbox_h=1000):
        """Wrap cells/rows/columns in a content-zone dict (lists are not copied)."""
        bbox = {"x": bbox_x, "y": bbox_y, "w": bbox_w, "h": bbox_h}
        return {
            "zone_index": zone_index,
            "zone_type": "content",
            "bbox_px": bbox,
            "cells": cells,
            "rows": rows,
            "columns": columns,
        }

    def _cell(self, row, col, text, left, top, width, height,
              color="black", col_type=None):
        """Return a zone-0 cell holding exactly one word box with the same text.

        ``col_type`` defaults to ``column_<col + 1>``.
        """
        if col_type is None:
            col_type = f"column_{col + 1}"
        return {
            "cell_id": f"Z0_R{row:02d}_C{col}",
            "zone_index": 0,
            "row_index": row,
            "col_index": col,
            "col_type": col_type,
            "text": text,
            "word_boxes": [
                self._make_word_box(text, left, top, width, height, color),
            ],
        }

    def _plain_second_row(self):
        """Return the black, height-20 'foo' / 'bar' cells used as row 1."""
        return [
            self._cell(1, 0, "foo", 10, 130, 80, 20, "black"),
            self._cell(1, 1, "bar", 300, 130, 80, 20, "black"),
        ]

    @staticmethod
    def _row(index, y_min, y_max, is_header=False):
        """Return a row descriptor dict."""
        return {
            "index": index,
            "y_min_px": y_min,
            "y_max_px": y_max,
            "is_header": is_header,
        }

    @staticmethod
    def _columns(count=2):
        """Return ``count`` column descriptors labelled column_1, column_2, ..."""
        return [{"index": i, "label": f"column_{i + 1}"} for i in range(count)]

    def _count_headings(self, cells, rows, columns):
        """Run the detector on a single zone and return its result."""
        zones_data = [self._make_zone(cells, rows, columns)]
        return _detect_heading_rows_by_color(zones_data, 800, 1000)

    def test_blue_heading_detected(self):
        """An all-blue row that is taller than the others becomes a heading."""
        heading_ri = 2
        cells = []
        for ri in range(5):
            top = 100 + ri * 30
            # Ordinary rows are black with height 20; the heading row is blue
            # with height 25, i.e. above 1.2 * 20 = 24.
            if ri == heading_ri:
                color, height = "blue", 25
            else:
                color, height = "black", 20
            cells.append(
                self._cell(ri, 0, f"word_{ri}", 10, top, 80, height, color))
            cells.append(
                self._cell(ri, 1, f"translation_{ri}", 300, top, 100, height, color))

        rows = [self._row(ri, 100 + ri * 30, 120 + ri * 30) for ri in range(5)]
        zones_data = [self._make_zone(cells, rows, self._columns())]

        count = _detect_heading_rows_by_color(zones_data, 800, 1000)

        assert count == 1
        # The row dict handed in is expected to be flagged in place.
        assert rows[2]["is_header"] is True
        # The row's two cells are expected to be replaced by one heading cell.
        heading_cells = [
            cell for cell in zones_data[0]["cells"]
            if cell["row_index"] == heading_ri
        ]
        assert len(heading_cells) == 1
        merged = heading_cells[0]
        assert merged["col_type"] == "heading"
        assert "word_2" in merged["text"]
        assert "translation_2" in merged["text"]

    def test_black_row_not_heading(self):
        """A tall row made only of black words is not a heading."""
        cells = [
            self._cell(0, 0, "hello", 10, 100, 80, 25, "black"),
            self._cell(0, 1, "world", 300, 100, 80, 25, "black"),
            *self._plain_second_row(),
        ]
        rows = [self._row(0, 100, 125), self._row(1, 130, 150)]

        assert self._count_headings(cells, rows, self._columns()) == 0

    def test_mixed_color_row_not_heading(self):
        """A row mixing blue and black words is not a heading."""
        cells = [
            self._cell(0, 0, "Unit", 10, 100, 60, 25, "blue"),
            self._cell(0, 1, "normal", 300, 100, 80, 25, "black"),
            *self._plain_second_row(),
        ]
        rows = [self._row(0, 100, 125), self._row(1, 130, 150)]

        assert self._count_headings(cells, rows, self._columns()) == 0

    def test_colored_but_not_tall_not_heading(self):
        """An all-blue row of ordinary height is not a heading."""
        cells = [
            self._cell(0, 0, "Unit", 10, 100, 60, 20, "blue"),
            self._cell(0, 1, "four", 300, 100, 60, 20, "blue"),
            *self._plain_second_row(),
        ]
        rows = [self._row(0, 100, 120), self._row(1, 130, 150)]

        assert self._count_headings(cells, rows, self._columns()) == 0

    def test_single_column_zone_skipped(self):
        """A zone with fewer than two columns yields no headings."""
        cells = [self._cell(0, 0, "Unit", 10, 100, 60, 25, "blue")]
        rows = [self._row(0, 100, 125)]

        assert self._count_headings(cells, rows, self._columns(1)) == 0

    def test_already_header_skipped(self):
        """A row already flagged is_header is not counted again."""
        cells = [
            self._cell(0, 0, "Header", 10, 100, 60, 25, "blue",
                       col_type="spanning_header"),
            *self._plain_second_row(),
        ]
        rows = [
            self._row(0, 100, 125, is_header=True),
            self._row(1, 130, 150),
        ]

        assert self._count_headings(cells, rows, self._columns()) == 0