Files
breakpilot-lehrer/klausur-service/backend/tests/test_grid_editor_api.py
Benjamin Admin 7ac09b5941 Filter pipe-character word_boxes from OCR column divider artifacts
Step 4d removes "|" and "||" word_boxes that OCR produces when reading
physical vertical divider lines between columns. Also strips stray pipe
chars from cell text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 12:09:50 +01:00

807 lines
36 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
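Tests for grid_editor_api zone merging, heading detection, ghost filtering,
pipe-divider filtering, and garbled-IPA repair.
Covers:
- _merge_content_zones_across_boxes: zone merging logic
- _detect_heading_rows_by_color: heading detection by color + height
- _detect_heading_rows_by_single_cell: heading detection for single-cell rows
- _filter_border_ghosts: single-char ghost detection
- Step 4d: removal of '|' / '||' word_boxes (simulated inline in the tests)
- _detect_header_rows: skip_first_row_header flag
- _text_has_garbled_ipa / fix_ipa_continuation_cell (from cv_ocr_engines)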
"""
import sys
sys.path.insert(0, '/app')
import pytest
from cv_vocab_types import PageZone, DetectedBox
from grid_editor_api import (
_merge_content_zones_across_boxes,
_filter_border_ghosts,
_detect_header_rows,
_detect_heading_rows_by_color,
_detect_heading_rows_by_single_cell,
)
from cv_ocr_engines import _text_has_garbled_ipa, fix_ipa_continuation_cell
# ---------------------------------------------------------------------------
# _merge_content_zones_across_boxes
# ---------------------------------------------------------------------------
class TestMergeContentZonesAcrossBoxes:
    """Exercise ``_merge_content_zones_across_boxes`` on hand-built zone lists.

    Every test calls the function with the zone list plus the arguments
    ``0`` and ``500`` and inspects the returned list of zones.
    """

    @staticmethod
    def _content(idx, top, tall):
        """Build a content zone spanning x=0..500 at the given y/height."""
        return PageZone(index=idx, zone_type="content", y=top, height=tall, x=0, width=500)

    @staticmethod
    def _box(idx, top, tall, conf, left=50, wide=400):
        """Build a box zone whose ``DetectedBox`` has the same geometry as the zone."""
        return PageZone(
            index=idx, zone_type="box", y=top, height=tall, x=left, width=wide,
            box=DetectedBox(x=left, y=top, width=wide, height=tall, confidence=conf),
        )

    def test_no_merge_when_less_than_3_zones(self):
        """Two zones only → nothing to merge, both come back as-is."""
        zones = [
            self._content(0, 0, 100),
            self._box(1, 100, 50, 0.9),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        assert len(result) == 2
        assert result[0].zone_type == "content"
        assert result[1].zone_type == "box"

    def test_merge_content_box_content(self):
        """[content, box, content] collapses into one content zone with an overlay."""
        zones = [
            self._content(0, 0, 100),
            self._box(1, 100, 50, 0.9),
            self._content(2, 150, 200),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        assert len(result) == 1
        merged = result[0]
        assert merged.zone_type == "content"
        assert merged.y == 0
        assert merged.height == 350  # spans y=0 .. y=350
        assert len(merged.image_overlays) == 1
        overlay = merged.image_overlays[0]
        assert overlay["y"] == 100
        assert overlay["height"] == 50

    def test_box_at_start_not_merged(self):
        """A leading box (no content above it) remains its own zone."""
        zones = [
            self._box(0, 0, 50, 0.9),
            self._content(1, 50, 100),
            self._box(2, 150, 50, 0.9),
            self._content(3, 200, 200),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        # Leading box is kept; the trailing content+box+content triple merges.
        assert len(result) == 2
        assert result[0].zone_type == "box"
        assert result[1].zone_type == "content"
        assert len(result[1].image_overlays) == 1

    def test_consecutive_boxes_not_merged(self):
        """[content, box, box, content] → left untouched (rare in practice)."""
        zones = [
            self._content(0, 0, 100),
            self._box(1, 100, 50, 0.9),
            self._box(2, 150, 30, 0.8, left=60, wide=380),
            self._content(3, 180, 200),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        # Only [content, box, content] triples are merged; two adjacent boxes
        # break that pattern, so all four zones survive.
        assert len(result) == 4

    def test_zone_reindexing(self):
        """After a merge the surviving zone is numbered from 0 again."""
        zones = [
            self._content(0, 0, 100),
            self._box(1, 100, 50, 0.9),
            self._content(2, 150, 200),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        assert result[0].index == 0

    def test_no_boxes_passthrough(self):
        """A list made solely of content zones keeps its length."""
        zones = [
            self._content(0, 0, 100),
            self._content(1, 100, 100),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        assert len(result) == 2

    def test_typical_vocab_page_pattern(self):
        """[box(VOCABULARY), content, box(image), content] → header box + merged content."""
        zones = [
            self._box(0, 10, 40, 0.95),
            self._content(1, 60, 50),
            self._box(2, 120, 85, 0.8),
            self._content(3, 210, 500),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        assert len(result) == 2
        header_box = result[0]
        merged = result[1]
        assert header_box.zone_type == "box"  # VOCABULARY header box is kept
        assert merged.zone_type == "content"  # the merged content zone
        assert merged.y == 60
        assert merged.height == 710 - 60  # spans y=60 .. y=710
        assert len(merged.image_overlays) == 1
        assert merged.image_overlays[0]["y"] == 120
        # Indices are renumbered consecutively.
        assert header_box.index == 0
        assert merged.index == 1
# ---------------------------------------------------------------------------
# _detect_heading_rows_by_color
# ---------------------------------------------------------------------------
class TestDetectHeadingRowsByColor:
    """Checks for ``_detect_heading_rows_by_color`` (colored and taller rows)."""

    def _make_word_box(self, text, left, top, width, height, color="black"):
        """Return a word-box dict with a fixed ``conf`` of 90."""
        return dict(
            text=text, left=left, top=top, width=width, height=height,
            color_name=color, conf=90,
        )

    def _make_zone(self, cells, rows, columns, zone_index=0,
                   bbox_x=0, bbox_y=0, bbox_w=800, bbox_h=1000):
        """Wrap cells/rows/columns into a content-zone dict."""
        bbox = {"x": bbox_x, "y": bbox_y, "w": bbox_w, "h": bbox_h}
        return {
            "zone_index": zone_index,
            "zone_type": "content",
            "bbox_px": bbox,
            "cells": cells,
            "rows": rows,
            "columns": columns,
        }

    def _cell(self, row, col, text, word_box, col_type=None):
        """Build a zone-0 cell holding exactly one word box."""
        return {
            "cell_id": f"Z0_R{row:02d}_C{col}",
            "zone_index": 0,
            "row_index": row,
            "col_index": col,
            "col_type": f"column_{col + 1}" if col_type is None else col_type,
            "text": text,
            "word_boxes": [word_box],
        }

    @staticmethod
    def _row(index, y_min, y_max, is_header=False):
        """Build a row dict."""
        return {"index": index, "y_min_px": y_min, "y_max_px": y_max, "is_header": is_header}

    @staticmethod
    def _two_columns():
        """Fresh two-column definition list."""
        return [{"index": 0, "label": "column_1"}, {"index": 1, "label": "column_2"}]

    def _plain_second_row(self):
        """Black, 20px-high 'foo' / 'bar' cells for row 1."""
        return [
            self._cell(1, 0, "foo", self._make_word_box("foo", 10, 130, 80, 20, "black")),
            self._cell(1, 1, "bar", self._make_word_box("bar", 300, 130, 80, 20, "black")),
        ]

    def test_blue_heading_detected(self):
        """A row whose words are all blue and taller than the rest → heading."""
        heading_ri = 2
        cells = []
        for ri in range(5):
            top = 100 + ri * 30
            if ri == heading_ri:
                # Heading row: blue and 25px high (> 1.2 * 20 = 24).
                color, height = "blue", 25
            else:
                # Ordinary rows: black, 20px high.
                color, height = "black", 20
            cells.append(self._cell(
                ri, 0, f"word_{ri}",
                self._make_word_box(f"word_{ri}", 10, top, 80, height, color),
            ))
            cells.append(self._cell(
                ri, 1, f"translation_{ri}",
                self._make_word_box(f"translation_{ri}", 300, top, 100, height, color),
            ))
        rows = [self._row(ri, 100 + ri * 30, 120 + ri * 30) for ri in range(5)]
        zones_data = [self._make_zone(cells, rows, self._two_columns())]

        count = _detect_heading_rows_by_color(zones_data, 800, 1000)

        assert count == 1
        # The row dict itself is flagged as header.
        assert rows[2]["is_header"] is True
        # The row's cells were replaced by one heading cell carrying both texts.
        heading_cells = [c for c in zones_data[0]["cells"] if c["row_index"] == heading_ri]
        assert len(heading_cells) == 1
        heading = heading_cells[0]
        assert heading["col_type"] == "heading"
        assert "word_2" in heading["text"]
        assert "translation_2" in heading["text"]

    def test_black_row_not_heading(self):
        """A tall row in plain black is not reported as heading."""
        cells = [
            self._cell(0, 0, "hello", self._make_word_box("hello", 10, 100, 80, 25, "black")),
            self._cell(0, 1, "world", self._make_word_box("world", 300, 100, 80, 25, "black")),
            *self._plain_second_row(),
        ]
        rows = [self._row(0, 100, 125), self._row(1, 130, 150)]
        zones_data = [self._make_zone(cells, rows, self._two_columns())]
        assert _detect_heading_rows_by_color(zones_data, 800, 1000) == 0

    def test_mixed_color_row_not_heading(self):
        """One blue word next to one black word → no heading."""
        cells = [
            self._cell(0, 0, "Unit", self._make_word_box("Unit", 10, 100, 60, 25, "blue")),
            self._cell(0, 1, "normal", self._make_word_box("normal", 300, 100, 80, 25, "black")),
            *self._plain_second_row(),
        ]
        rows = [self._row(0, 100, 125), self._row(1, 130, 150)]
        zones_data = [self._make_zone(cells, rows, self._two_columns())]
        assert _detect_heading_rows_by_color(zones_data, 800, 1000) == 0

    def test_colored_but_not_tall_not_heading(self):
        """All-blue row of ordinary height → no heading."""
        cells = [
            self._cell(0, 0, "Unit", self._make_word_box("Unit", 10, 100, 60, 20, "blue")),
            self._cell(0, 1, "four", self._make_word_box("four", 300, 100, 60, 20, "blue")),
            *self._plain_second_row(),
        ]
        rows = [self._row(0, 100, 120), self._row(1, 130, 150)]
        zones_data = [self._make_zone(cells, rows, self._two_columns())]
        assert _detect_heading_rows_by_color(zones_data, 800, 1000) == 0

    def test_single_column_zone_skipped(self):
        """A zone with fewer than 2 columns yields no headings."""
        cells = [
            self._cell(0, 0, "Unit", self._make_word_box("Unit", 10, 100, 60, 25, "blue")),
        ]
        rows = [self._row(0, 100, 125)]
        columns = [{"index": 0, "label": "column_1"}]
        zones_data = [self._make_zone(cells, rows, columns)]
        assert _detect_heading_rows_by_color(zones_data, 800, 1000) == 0

    def test_already_header_skipped(self):
        """A row that already has ``is_header`` set is not counted again."""
        cells = [
            self._cell(
                0, 0, "Header",
                self._make_word_box("Header", 10, 100, 60, 25, "blue"),
                col_type="spanning_header",
            ),
            *self._plain_second_row(),
        ]
        rows = [self._row(0, 100, 125, is_header=True), self._row(1, 130, 150)]
        zones_data = [self._make_zone(cells, rows, self._two_columns())]
        assert _detect_heading_rows_by_color(zones_data, 800, 1000) == 0
# ---------------------------------------------------------------------------
# _filter_border_ghosts (Fix 2: single-char only)
# ---------------------------------------------------------------------------
class TestFilterBorderGhosts:
    """``_filter_border_ghosts`` should drop only single-character border artifacts."""

    @staticmethod
    def _word(text, left, top, width, height):
        """Build a minimal OCR word dict."""
        return {"text": text, "left": left, "top": top, "width": width, "height": height}

    @staticmethod
    def _bordered_box(x, y, width, height, confidence, thickness):
        """Build a ``DetectedBox`` with the given border thickness."""
        return DetectedBox(
            x=x, y=y, width=width, height=height,
            confidence=confidence, border_thickness=thickness,
        )

    def test_single_char_ghost_removed(self):
        """A lone '|' sitting on a box border is removed as a ghost."""
        box = self._bordered_box(100, 100, 200, 150, 0.9, 3)
        words = [
            self._word("|", 98, 200, 5, 20),
            self._word("hello", 150, 150, 80, 20),
        ]
        filtered, count = _filter_border_ghosts(words, [box])
        assert count == 1
        assert len(filtered) == 1
        assert filtered[0]["text"] == "hello"

    def test_multi_char_ghost_kept(self):
        """Two-character '(=' beside a bordered box is real content and stays."""
        box = self._bordered_box(648, 129, 338, 125, 0.7, 3)
        words = [
            self._word("(=", 644, 294, 16, 17),
            self._word("I", 665, 294, 9, 18),
        ]
        filtered, count = _filter_border_ghosts(words, [box])
        assert count == 0
        assert len(filtered) == 2

    def test_borderless_box_no_ghost_filter(self):
        """With border_thickness=0 nothing is filtered, not even a pipe on the edge."""
        box = self._bordered_box(648, 129, 338, 125, 0.7, 0)
        words = [
            self._word("I", 643, 272, 6, 19),   # close to the box edge
            self._word("|", 647, 200, 3, 10),   # a pipe right on the edge
        ]
        filtered, count = _filter_border_ghosts(words, [box])
        assert count == 0  # borderless box → filter is inactive
        assert len(filtered) == 2

    def test_single_paren_on_border_removed(self):
        """A lone ')' on the border is removed."""
        box = self._bordered_box(100, 100, 200, 150, 0.9, 2)
        words = [self._word(")", 299, 200, 4, 7)]
        filtered, count = _filter_border_ghosts(words, [box])
        assert count == 1
        assert len(filtered) == 0
# ---------------------------------------------------------------------------
# Step 4d: Pipe-character divider filter
# ---------------------------------------------------------------------------
class TestPipeDividerFilter:
    """Step 4d removes '|' word_boxes that are OCR artifacts from column dividers.

    The production Step 4d code is not imported by this module; the tests run
    a local re-implementation (``_run_step_4d``) so that all of them exercise
    one and the same simulation instead of per-test copies.
    NOTE(review): keep ``_run_step_4d`` in sync with the real Step 4d — TODO
    confirm against grid_editor_api.
    """

    @staticmethod
    def _is_pipe_only(text):
        """Return True when *text*, stripped, consists solely of '|' characters.

        ``None`` is treated like the empty string and is therefore not a pipe.
        """
        import re  # function-scope import: the module has no top-level ``re``

        return re.match(r"^\|+$", (text or "").strip()) is not None

    @staticmethod
    def _run_step_4d(zone):
        """Apply the simulated pipe-divider filter to *zone* in place.

        For every cell, word_boxes whose text is pipe-only are dropped. When a
        cell lost at least one box, its ``text`` is rebuilt from the remaining
        boxes ordered by (top, left). If anything was removed, cells left with
        neither word_boxes nor non-blank text are deleted from the zone.

        Args:
            zone: dict with a ``"cells"`` list; each cell may carry
                ``"word_boxes"`` (list of dicts) and ``"text"``.

        Returns:
            The total number of word_boxes removed.
        """
        is_pipe = TestPipeDividerFilter._is_pipe_only
        removed = 0
        for cell in zone["cells"]:
            boxes = cell.get("word_boxes") or []
            kept = [wb for wb in boxes if not is_pipe(wb.get("text"))]
            if len(kept) == len(boxes):
                continue  # untouched cells keep their original text
            removed += len(boxes) - len(kept)
            cell["word_boxes"] = kept
            ordered = sorted(kept, key=lambda wb: (wb.get("top", 0), wb.get("left", 0)))
            # ``or ""`` keeps the rebuild None-safe, matching the filter above.
            texts = ((wb.get("text") or "").strip() for wb in ordered)
            cell["text"] = " ".join(t for t in texts if t)
        if removed:
            zone["cells"] = [
                c for c in zone["cells"]
                if c.get("word_boxes") or (c.get("text") or "").strip()
            ]
        return removed

    def test_pipe_word_boxes_removed(self):
        """Word boxes with text '|' or '||' are removed from cells."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello | world",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                        {"text": "|", "top": 10, "left": 55, "height": 15, "width": 5},
                        {"text": "world", "top": 10, "left": 65, "height": 15, "width": 40},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }
        removed = self._run_step_4d(zone)
        assert removed == 1
        assert len(zone["cells"][0]["word_boxes"]) == 2
        assert zone["cells"][0]["text"] == "hello world"

    def test_pipe_only_cell_removed(self):
        """A cell containing only '|' word_boxes becomes empty and is removed."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                    ],
                },
                {
                    "cell_id": "Z0_R0_C1",
                    "text": "|",
                    "word_boxes": [
                        {"text": "|", "top": 10, "left": 740, "height": 15, "width": 5},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }
        removed = self._run_step_4d(zone)
        assert removed == 1
        assert len(zone["cells"]) == 1
        assert zone["cells"][0]["text"] == "hello"

    def test_double_pipe_removed(self):
        """'||' is also treated as a divider artifact; mixed text is not."""
        assert self._is_pipe_only("||")
        assert self._is_pipe_only("|")
        assert not self._is_pipe_only("hello")
        assert not self._is_pipe_only("|word")

    def test_none_text_word_box_tolerated(self):
        """A surviving word box with ``text=None`` must not break the text rebuild."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello |",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                        {"text": None, "top": 10, "left": 50, "height": 15, "width": 5},
                        {"text": "|", "top": 10, "left": 55, "height": 15, "width": 5},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }
        assert self._run_step_4d(zone) == 1
        assert zone["cells"][0]["text"] == "hello"
# ---------------------------------------------------------------------------
# _detect_header_rows (Fix 3: skip_first_row_header)
# ---------------------------------------------------------------------------
class TestDetectHeaderRowsSkipFlag:
    """Behaviour of ``_detect_header_rows`` with and without ``skip_first_row_header``."""

    @staticmethod
    def _rows_and_words():
        """Return three rows (large gap after the first) and one 20px word per row."""
        spans = [(100, 120), (160, 180), (185, 205)]
        rows = [
            {"y_min": y_min, "y_max": y_max, "index": idx}
            for idx, (y_min, y_max) in enumerate(spans)
        ]
        # Each word starts 5px below its row's top edge.
        words = [
            {"height": 20, "top": y_min + 5, "left": 10, "width": 80}
            for y_min, _ in spans
        ]
        return rows, words

    def test_first_row_detected_without_flag(self):
        """Default call: the first row, followed by a big gap, is a header."""
        rows, words = self._rows_and_words()
        headers = _detect_header_rows(rows, words, 0)
        assert 0 in headers

    def test_first_row_skipped_with_flag(self):
        """With the skip flag the first row is not reported despite the gap."""
        rows, words = self._rows_and_words()
        headers = _detect_header_rows(rows, words, 0, skip_first_row_header=True)
        assert 0 not in headers
# ---------------------------------------------------------------------------
# _text_has_garbled_ipa + fix_ipa_continuation_cell
# ---------------------------------------------------------------------------
class TestGarbledIpaDetection:
    """Test detection and fixing of garbled IPA in bracket notation.

    Covers ``_text_has_garbled_ipa`` (detection) and
    ``fix_ipa_continuation_cell`` (replacement of a garbled continuation cell
    using the headword text), both imported from ``cv_ocr_engines``.
    """

    def test_bracket_garbled_no_ipa_chars(self):
        """'[n, nn]' — brackets with no real IPA chars → garbled."""
        assert _text_has_garbled_ipa("[n, nn]") is True

    def test_bracket_garbled_alphanumeric(self):
        """'[1uedtX,1]' — brackets with digits/letters → garbled."""
        assert _text_has_garbled_ipa("[1uedtX,1]") is True

    def test_bracket_valid_ipa_detected(self):
        """'[ɪkwˈɪpmənt]' — brackets with real IPA → detected (has IPA chars).

        Note: _text_has_garbled_ipa detects IPA-like fragments in text.
        Valid IPA also triggers it; callers use a separate check
        (re.search for proper IPA brackets) to skip already-correct IPA.
        """
        assert _text_has_garbled_ipa("[ɪkwˈɪpmənt]") is True

    def test_no_brackets_normal_word(self):
        """'equipment' — normal word → not garbled."""
        assert _text_has_garbled_ipa("equipment") is False

    def test_fix_continuation_united_kingdom(self):
        """IPA continuation for 'the United Kingdom' → IPA without 'the'."""
        fixed = fix_ipa_continuation_cell(
            "[n, nn]", "the United Kingdom", pronunciation="british",
        )
        # Should contain proper IPA, not the garbled text
        assert fixed != "[n, nn]"
        assert "kˈɪŋdəm" in fixed  # Kingdom IPA
        assert "ðə" not in fixed  # "the" must NOT get IPA

    def test_fix_continuation_equipment(self):
        """IPA continuation for 'equipment' → proper IPA."""
        fixed = fix_ipa_continuation_cell(
            "[1uedtX,1]", "equipment (no pl)", pronunciation="british",
        )
        assert fixed != "[1uedtX,1]"
        assert "ɪkwˈɪpmənt" in fixed  # equipment IPA

    def test_fix_continuation_close_down(self):
        """IPA continuation for 'close sth. down' → IPA for both words."""
        fixed = fix_ipa_continuation_cell(
            "[klaoz 'daun]", "close sth. down", pronunciation="british",
        )
        assert fixed != "[klaoz 'daun]"
        assert "klˈəʊs" in fixed  # close IPA
        assert "dˈaʊn" in fixed  # down IPA — must NOT be skipped

    def test_continuation_skips_words_with_inline_ipa(self):
        """'beat [bˈiːt] , beat, beaten' → continuation only for 'beaten'."""
        fixed = fix_ipa_continuation_cell(
            "[bi:tan]", "beat [bˈiːt] , beat, beaten", pronunciation="british",
        )
        # Should only have IPA for "beaten", NOT for "beat" (already inline)
        assert "bˈiːtən" in fixed
        # "bˈiːtən" itself contains "bˈiːt", so a "count of 'bˈiːt' is zero"
        # alternative could never hold once the assertion above passed; the
        # only satisfiable requirement is the exact result below.
        assert fixed == "[bˈiːtən]"

    def test_continuation_bracket_at_end_returns_inline(self):
        """'the Highlands [ˈhaɪləndz]' → return inline IPA, not IPA for 'the'."""
        fixed = fix_ipa_continuation_cell(
            "'hailandz", "the Highlands [ˈhaɪləndz]", pronunciation="british",
        )
        assert fixed == "[ˈhaɪləndz]"
        assert "ðə" not in fixed  # "the" must NOT get IPA

    def test_headword_with_brackets_not_continuation(self):
        """'employee [im'ploi:]' has a headword outside brackets → not garbled.

        _text_has_garbled_ipa returns True (has ':'), but Step 5d should
        skip this cell because text doesn't start with '['.
        """
        # The garbled check still triggers (has IPA-like ':')
        assert _text_has_garbled_ipa("employee [im'ploi:]") is True
        # But text does NOT start with '[' — Step 5d bracket guard blocks it.
        # This last check only evaluates the guard expression locally; it does
        # not call into the production Step 5d code.
        text = "employee [im'ploi:]"
        assert not (text.strip().startswith('[') and text.strip().endswith(']'))
# ---------------------------------------------------------------------------
# _detect_heading_rows_by_single_cell
# ---------------------------------------------------------------------------
class TestDetectHeadingRowsBySingleCell:
    """Heading detection for black rows that hold a single cell (e.g. 'Theme')."""

    def _make_word_box(self, text, left, top, width, height, color="black"):
        """Return a word-box dict with a fixed ``conf`` of 90."""
        return dict(
            text=text, left=left, top=top, width=width, height=height,
            color_name=color, conf=90,
        )

    def _cell(self, row, col, text, word_boxes):
        """Build a zone-0 cell in column ``col`` (col_type ``column_<col+1>``)."""
        return {
            "cell_id": f"Z0_R{row:02d}_C{col}",
            "zone_index": 0, "row_index": row, "col_index": col,
            "col_type": f"column_{col + 1}", "text": text,
            "word_boxes": word_boxes,
        }

    @staticmethod
    def _drop_row(zone, row):
        """Remove every cell of ``row`` from the zone's cell list."""
        zone["cells"] = [c for c in zone["cells"] if c["row_index"] != row]

    def _make_vocab_zone(self):
        """Build a 4-column, 8-row vocab zone.

        Columns: column_1 (page_ref), column_2 (EN), column_3 (DE),
        column_4 (example). Row 4 carries a single cell in column_2
        ("Theme"), making it the heading candidate.
        """
        cells = []
        for ri in range(8):
            top = 100 + ri * 30
            if ri == 4:
                # The lone "Theme" cell, column_2 only.
                cells.append(self._cell(
                    ri, 1, "Theme", [self._make_word_box("Theme", 130, top, 70, 20)],
                ))
                continue
            # Regular vocab row with all four columns filled.
            cells.extend([
                self._cell(ri, 0, f"p.{70 + ri}",
                           [self._make_word_box(f"p.{70+ri}", 10, top, 30, 20)]),
                self._cell(ri, 1, f"word_{ri}",
                           [self._make_word_box(f"word_{ri}", 130, top, 80, 20)]),
                self._cell(ri, 2, f"Wort_{ri}",
                           [self._make_word_box(f"Wort_{ri}", 400, top, 80, 20)]),
                self._cell(ri, 3, f"Example sentence {ri}.",
                           [self._make_word_box("Example", 600, top, 120, 20)]),
            ])
        rows = [
            {"index": ri, "y_min_px": 100 + ri * 30, "y_max_px": 120 + ri * 30, "is_header": False}
            for ri in range(8)
        ]
        columns = [
            {"col_index": ci, "col_type": f"column_{ci + 1}"}
            for ci in range(4)
        ]
        return {
            "zone_index": 0, "zone_type": "content",
            "bbox_px": {"x": 0, "y": 0, "w": 800, "h": 1000},
            "cells": cells, "rows": rows, "columns": columns,
        }

    def test_single_cell_heading_detected(self):
        """A row whose only cell sits in column_2 becomes a heading."""
        zone = self._make_vocab_zone()
        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        assert count == 1
        heading_cells = [c for c in zone["cells"] if c["row_index"] == 4]
        assert len(heading_cells) == 1
        heading = heading_cells[0]
        assert heading["col_type"] == "heading"
        assert heading["text"] == "Theme"
        assert heading["col_index"] == 1  # begins at column_2, not column 0

    def test_single_cell_in_last_column_not_heading(self):
        """A lone cell in column_4 (the last column) is a continuation, not a heading."""
        zone = self._make_vocab_zone()
        # Swap row 4's "Theme" cell for a single cell in the last column.
        zone["cells"] = [
            c for c in zone["cells"]
            if not (c["row_index"] == 4 and c["col_index"] == 1)
        ]
        zone["cells"].append(self._cell(
            4, 3, "2. Veränderung", [self._make_word_box("2.", 600, 220, 100, 20)],
        ))
        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        # Row 4 holds only a column_4 cell and "Theme" is gone → no headings.
        assert count == 0

    def test_ipa_bracket_text_not_heading(self):
        """A lone cell holding an IPA continuation fragment ('ˈiːm]') is not a heading."""
        zone = self._make_vocab_zone()
        # Overwrite the "Theme" text with the IPA fragment.
        theme_cell = next(
            c for c in zone["cells"]
            if c["row_index"] == 4 and c["col_index"] == 1
        )
        theme_cell["text"] = "ˈiːm]"
        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        assert count == 0

    def test_multi_cell_row_not_heading(self):
        """Ordinary multi-cell vocab rows are never turned into headings."""
        zone = self._make_vocab_zone()
        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        # Row 4 ("Theme") is the only heading; the others have four cells each.
        assert count == 1
        ordinary_rows = [0, 1, 2, 3, 5, 6, 7]
        for ri in ordinary_rows:
            for c in (cell for cell in zone["cells"] if cell["row_index"] == ri):
                assert c["col_type"] != "heading"

    def test_color_heading_preserves_correct_col_index(self):
        """A color heading that begins in column_2 keeps col_index 1 (not 0)."""
        zone = self._make_vocab_zone()
        # Rebuild row 3 as a blue, 26px-high heading spanning column_2 and
        # column_3 only (this row has no column_1 page_ref).
        self._drop_row(zone, 3)
        zone["cells"].append(self._cell(3, 1, "Unit 4:", [
            self._make_word_box("Unit", 130, 190, 50, 26, "blue"),
            self._make_word_box("4:", 185, 190, 20, 26, "blue"),
        ]))
        zone["cells"].append(self._cell(3, 2, "Scotland", [
            self._make_word_box("Scotland", 400, 190, 100, 26, "blue"),
        ]))
        count = _detect_heading_rows_by_color([zone], 800, 1000)
        assert count == 1
        heading = [c for c in zone["cells"] if c["row_index"] == 3]
        assert len(heading) == 1
        assert heading[0]["col_type"] == "heading"
        assert heading[0]["col_index"] == 1  # starts at column_2, not column 0

    def test_last_row_single_cell_not_heading(self):
        """A lone cell in the final row (e.g. page number '212') is not a heading."""
        zone = self._make_vocab_zone()
        # Reduce row 7, the last one, to a single column_2 cell.
        self._drop_row(zone, 7)
        zone["cells"].append(self._cell(
            7, 1, "two hundred and twelve",
            [self._make_word_box("two", 130, 310, 30, 20)],
        ))
        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        # "Theme" in row 4 is the one heading; row 7 must not be among them.
        assert count == 1
        heading_rows = [
            c["row_index"] for c in zone["cells"] if c.get("col_type") == "heading"
        ]
        assert all(ri != 7 for ri in heading_rows)