# Commit note (pasted from the repository viewer; kept for reference):
#   Scanner artifacts on black text produce a slight warm tint (hue ~0, sat ~60)
#   that was misclassified as red. Red classification now requires
#   median_sat >= 80, since genuine red text always has high saturation.
#   Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
# Viewer metadata: 957 lines, 42 KiB, Python
"""
|
||
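Tests for grid_editor_api zone merging, heading detection, and ghost filtering.

Covers:
- _merge_content_zones_across_boxes: zone merging logic
- _detect_heading_rows_by_color: heading detection by color + height
- _detect_heading_rows_by_single_cell: heading detection for single-cell rows
- _filter_border_ghosts: single-char ghost detection
- _detect_header_rows: skip_first_row_header flag
- _text_has_garbled_ipa / fix_ipa_continuation_cell: garbled IPA detection and repair
- Step 4d (pipe divider filter) and Step 5h (slash-IPA conversion), simulated inline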
|
||
"""
|
||
|
||
import sys
|
||
sys.path.insert(0, '/app')
|
||
|
||
import cv2
|
||
import numpy as np
|
||
import pytest
|
||
from cv_vocab_types import PageZone, DetectedBox
|
||
from grid_editor_api import (
|
||
_merge_content_zones_across_boxes,
|
||
_filter_border_ghosts,
|
||
_detect_header_rows,
|
||
_detect_heading_rows_by_color,
|
||
_detect_heading_rows_by_single_cell,
|
||
)
|
||
from cv_ocr_engines import _text_has_garbled_ipa, fix_ipa_continuation_cell
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# _merge_content_zones_across_boxes
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestMergeContentZonesAcrossBoxes:
    """Test zone merging across box zones.

    Every test builds a list of ``PageZone`` objects and passes it to
    ``_merge_content_zones_across_boxes(zones, 0, 500)``, then inspects the
    returned zone list.
    """

    @staticmethod
    def _content(index, y, height):
        """Build a content zone spanning x=0..500 at the given vertical extent."""
        return PageZone(index=index, zone_type="content", y=y, height=height,
                        x=0, width=500)

    @staticmethod
    def _box(index, y, height, x=50, width=400, confidence=0.9):
        """Build a box zone whose ``DetectedBox`` has the same geometry as the zone."""
        return PageZone(
            index=index, zone_type="box", y=y, height=height, x=x, width=width,
            box=DetectedBox(x=x, y=y, width=width, height=height,
                            confidence=confidence),
        )

    def test_no_merge_when_less_than_3_zones(self):
        """Fewer than 3 zones → no merge possible."""
        zones = [
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        assert len(result) == 2
        assert result[0].zone_type == "content"
        assert result[1].zone_type == "box"

    def test_merge_content_box_content(self):
        """[content, box, content] → [merged_content with overlay]."""
        zones = [
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
            self._content(2, y=150, height=200),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        assert len(result) == 1
        merged = result[0]
        assert merged.zone_type == "content"
        assert merged.y == 0
        assert merged.height == 350  # 0 to 350
        assert len(merged.image_overlays) == 1
        assert merged.image_overlays[0]["y"] == 100
        assert merged.image_overlays[0]["height"] == 50

    def test_box_at_start_not_merged(self):
        """Box at the start (not between contents) stays separate."""
        zones = [
            self._box(0, y=0, height=50),
            self._content(1, y=50, height=100),
            self._box(2, y=150, height=50),
            self._content(3, y=200, height=200),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        # Box at start stays, then content+box+content merges
        assert len(result) == 2
        assert result[0].zone_type == "box"
        assert result[1].zone_type == "content"
        assert len(result[1].image_overlays) == 1

    def test_consecutive_boxes_not_merged(self):
        """[content, box, box, content] → no merge (consecutive boxes rare in practice)."""
        zones = [
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
            self._box(2, y=150, height=30, x=60, width=380, confidence=0.8),
            self._content(3, y=180, height=200),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        # Two consecutive boxes: the algorithm only merges [content, box, content]
        # pairs, so consecutive boxes break the pattern.
        assert len(result) == 4

    def test_zone_reindexing(self):
        """Zone indices are re-numbered after merging."""
        zones = [
            self._content(0, y=0, height=100),
            self._box(1, y=100, height=50),
            self._content(2, y=150, height=200),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        assert result[0].index == 0
        # Every returned zone must carry its own position as index, not just the first.
        assert [zone.index for zone in result] == list(range(len(result)))

    def test_no_boxes_passthrough(self):
        """All-content zones pass through unchanged."""
        zones = [
            self._content(0, y=0, height=100),
            self._content(1, y=100, height=100),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        assert len(result) == 2

    def test_typical_vocab_page_pattern(self):
        """Typical pattern: [box(VOCABULARY), content, box(image), content]
        → box stays, content+box+content merges."""
        zones = [
            self._box(0, y=10, height=40, confidence=0.95),
            self._content(1, y=60, height=50),
            self._box(2, y=120, height=85, confidence=0.8),
            self._content(3, y=210, height=500),
        ]
        result = _merge_content_zones_across_boxes(zones, 0, 500)
        assert len(result) == 2
        assert result[0].zone_type == "box"  # VOCABULARY header box stays
        assert result[1].zone_type == "content"  # merged content zone
        assert result[1].y == 60
        assert result[1].height == 710 - 60  # 60 to 710
        assert len(result[1].image_overlays) == 1
        assert result[1].image_overlays[0]["y"] == 120
        # Check reindexing
        assert result[0].index == 0
        assert result[1].index == 1
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# _detect_heading_rows_by_color
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestDetectHeadingRowsByColor:
    """Test heading detection by color + height.

    Each test assembles a ``zones_data`` list of plain dicts and checks the
    count returned by ``_detect_heading_rows_by_color(zones_data, 800, 1000)``.
    """

    def _make_word_box(self, text, left, top, width, height, color="black"):
        """Build a word-box dict with the given geometry/color and conf=90."""
        return {
            "text": text,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
            "color_name": color,
            "conf": 90,
        }

    def _make_zone(self, cells, rows, columns, zone_index=0,
                   bbox_x=0, bbox_y=0, bbox_w=800, bbox_h=1000):
        """Wrap cells/rows/columns into a "content" zone dict with a pixel bbox."""
        return {
            "zone_index": zone_index,
            "zone_type": "content",
            "bbox_px": {"x": bbox_x, "y": bbox_y, "w": bbox_w, "h": bbox_h},
            "cells": cells,
            "rows": rows,
            "columns": columns,
        }

    def _make_cell(self, row, col, text, left, top, width, height,
                   color="black", col_type=None):
        """Build a zone-0 cell holding one word box whose text equals the cell text.

        ``col_type`` defaults to ``column_<col + 1>``; ``cell_id`` follows the
        ``Z0_R<row:02d>_C<col>`` scheme used by the fixtures in this class.
        """
        return {
            "cell_id": f"Z0_R{row:02d}_C{col}",
            "zone_index": 0,
            "row_index": row,
            "col_index": col,
            "col_type": col_type if col_type is not None else f"column_{col + 1}",
            "text": text,
            "word_boxes": [
                self._make_word_box(text, left, top, width, height, color),
            ],
        }

    def _plain_second_row(self):
        """Return the black, normal-height row-1 cells ("foo", "bar") shared by several tests."""
        return [
            self._make_cell(1, 0, "foo", 10, 130, 80, 20),
            self._make_cell(1, 1, "bar", 300, 130, 80, 20),
        ]

    @staticmethod
    def _two_rows(first_y_max=125, first_is_header=False):
        """Return fresh row dicts: row 0 at y=100..first_y_max, row 1 at y=130..150."""
        return [
            {"index": 0, "y_min_px": 100, "y_max_px": first_y_max,
             "is_header": first_is_header},
            {"index": 1, "y_min_px": 130, "y_max_px": 150, "is_header": False},
        ]

    @staticmethod
    def _two_columns():
        """Return a fresh two-column definition list (new objects per call)."""
        return [{"index": 0, "label": "column_1"}, {"index": 1, "label": "column_2"}]

    def test_blue_heading_detected(self):
        """Row with all blue words + taller height → heading."""
        heading_ri = 2
        cells = []
        for ri in range(5):
            # Normal rows: black, height 20.
            # Heading row (index 2): blue, taller (height 25 > 1.2 * 20 = 24).
            is_heading = ri == heading_ri
            color = "blue" if is_heading else "black"
            height = 25 if is_heading else 20
            top = 100 + ri * 30
            cells.append(self._make_cell(
                ri, 0, f"word_{ri}", 10, top, 80, height, color))
            cells.append(self._make_cell(
                ri, 1, f"translation_{ri}", 300, top, 100, height, color))

        rows = [
            {"index": ri, "y_min_px": 100 + ri * 30, "y_max_px": 120 + ri * 30,
             "is_header": False}
            for ri in range(5)
        ]
        columns = self._two_columns()

        zones_data = [self._make_zone(cells, rows, columns)]
        count = _detect_heading_rows_by_color(zones_data, 800, 1000)

        assert count == 1
        # Check that row 2 is now marked as header (rows list is updated in place)
        assert rows[2]["is_header"] is True
        # Check that the heading cell was created
        heading_cells = [c for c in zones_data[0]["cells"]
                         if c["row_index"] == heading_ri]
        assert len(heading_cells) == 1
        assert heading_cells[0]["col_type"] == "heading"
        assert "word_2" in heading_cells[0]["text"]
        assert "translation_2" in heading_cells[0]["text"]

    def test_black_row_not_heading(self):
        """Row with black words → not a heading, even if tall."""
        cells = [
            self._make_cell(0, 0, "hello", 10, 100, 80, 25, "black"),
            self._make_cell(0, 1, "world", 300, 100, 80, 25, "black"),
            *self._plain_second_row(),
        ]
        zones_data = [self._make_zone(cells, self._two_rows(), self._two_columns())]
        count = _detect_heading_rows_by_color(zones_data, 800, 1000)
        assert count == 0

    def test_mixed_color_row_not_heading(self):
        """Row with some blue and some black words → not a heading."""
        cells = [
            self._make_cell(0, 0, "Unit", 10, 100, 60, 25, "blue"),
            self._make_cell(0, 1, "normal", 300, 100, 80, 25, "black"),
            *self._plain_second_row(),
        ]
        zones_data = [self._make_zone(cells, self._two_rows(), self._two_columns())]
        count = _detect_heading_rows_by_color(zones_data, 800, 1000)
        assert count == 0

    def test_colored_but_not_tall_not_heading(self):
        """Row with all blue words but normal height → not a heading."""
        cells = [
            self._make_cell(0, 0, "Unit", 10, 100, 60, 20, "blue"),
            self._make_cell(0, 1, "four", 300, 100, 60, 20, "blue"),
            *self._plain_second_row(),
        ]
        rows = self._two_rows(first_y_max=120)
        zones_data = [self._make_zone(cells, rows, self._two_columns())]
        count = _detect_heading_rows_by_color(zones_data, 800, 1000)
        assert count == 0

    def test_single_column_zone_skipped(self):
        """Zones with < 2 columns are skipped."""
        cells = [self._make_cell(0, 0, "Unit", 10, 100, 60, 25, "blue")]
        rows = [{"index": 0, "y_min_px": 100, "y_max_px": 125, "is_header": False}]
        columns = [{"index": 0, "label": "column_1"}]
        zones_data = [self._make_zone(cells, rows, columns)]
        count = _detect_heading_rows_by_color(zones_data, 800, 1000)
        assert count == 0

    def test_already_header_skipped(self):
        """Rows already marked is_header are not re-detected."""
        cells = [
            self._make_cell(0, 0, "Header", 10, 100, 60, 25, "blue",
                            col_type="spanning_header"),
            *self._plain_second_row(),
        ]
        rows = self._two_rows(first_is_header=True)
        zones_data = [self._make_zone(cells, rows, self._two_columns())]
        count = _detect_heading_rows_by_color(zones_data, 800, 1000)
        assert count == 0
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# _filter_border_ghosts (Fix 2: single-char only)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestFilterBorderGhosts:
    """Test that ghost filtering only removes single-char words.

    ``_filter_border_ghosts(words, boxes)`` is expected to return a
    ``(filtered_words, removed_count)`` pair, which each test unpacks.
    """

    def test_single_char_ghost_removed(self):
        """Single '|' on a box border → filtered as ghost."""
        box = DetectedBox(x=100, y=100, width=200, height=150,
                          confidence=0.9, border_thickness=3)
        words = [
            {"text": "|", "left": 98, "top": 200, "width": 5, "height": 20},
            {"text": "hello", "left": 150, "top": 150, "width": 80, "height": 20},
        ]
        filtered, count = _filter_border_ghosts(words, [box])
        assert count == 1
        assert len(filtered) == 1
        assert filtered[0]["text"] == "hello"

    def test_multi_char_ghost_kept(self):
        """Multi-char '(=' on a bordered box → NOT filtered (real content)."""
        box = DetectedBox(x=648, y=129, width=338, height=125,
                          confidence=0.7, border_thickness=3)
        words = [
            {"text": "(=", "left": 644, "top": 294, "width": 16, "height": 17},
            {"text": "I", "left": 665, "top": 294, "width": 9, "height": 18},
        ]
        filtered, count = _filter_border_ghosts(words, [box])
        assert count == 0
        assert len(filtered) == 2

    def test_borderless_box_no_ghost_filter(self):
        """Borderless box (border_thickness=0) → no ghost filtering at all."""
        box = DetectedBox(x=648, y=129, width=338, height=125,
                          confidence=0.7, border_thickness=0)
        words = [
            # near box edge
            {"text": "I", "left": 643, "top": 272, "width": 6, "height": 19},
            # even a pipe directly on the edge
            {"text": "|", "left": 647, "top": 200, "width": 3, "height": 10},
        ]
        filtered, count = _filter_border_ghosts(words, [box])
        assert count == 0  # nothing filtered — borderless box
        assert len(filtered) == 2

    def test_single_paren_on_border_removed(self):
        """Single ')' on border → filtered."""
        box = DetectedBox(x=100, y=100, width=200, height=150,
                          confidence=0.9, border_thickness=2)
        words = [
            {"text": ")", "left": 299, "top": 200, "width": 4, "height": 7},
        ]
        filtered, count = _filter_border_ghosts(words, [box])
        assert count == 1
        assert len(filtered) == 0
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Step 4d: Pipe-character divider filter
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestPipeDividerFilter:
    """Step 4d removes '|' word_boxes that are OCR artifacts from column dividers.

    NOTE(review): these tests re-implement the Step 4d logic inline instead of
    calling the production code, so they only pin the intended algorithm —
    confirm the helper below still mirrors the real implementation.
    """

    @staticmethod
    def _pipe_re():
        """Return the compiled pattern matching tokens made only of '|' characters."""
        # Local import: `re` is not among the module-level imports of this file.
        import re
        return re.compile(r"^\|+$")

    @classmethod
    def _apply_step_4d(cls, zone):
        """Simulate Step 4d on ``zone`` in place and return the removed word-box count.

        Pipe-only word boxes are dropped from every cell, and the text of each
        affected cell is rebuilt from the remaining boxes in (top, left) order.
        If anything was removed, cells left with neither word boxes nor
        non-blank text are dropped from ``zone["cells"]``.
        """
        pipe_re = cls._pipe_re()
        removed = 0
        for cell in zone["cells"]:
            wbs = cell.get("word_boxes") or []
            kept = [wb for wb in wbs
                    if not pipe_re.match((wb.get("text") or "").strip())]
            if len(kept) == len(wbs):
                continue  # cell untouched: keep its original text as-is
            removed += len(wbs) - len(kept)
            cell["word_boxes"] = kept
            cell["text"] = " ".join(
                wb.get("text", "").strip()
                for wb in sorted(kept, key=lambda w: (w.get("top", 0), w.get("left", 0)))
                if wb.get("text", "").strip()
            )
        if removed:
            zone["cells"] = [
                c for c in zone["cells"]
                if (c.get("word_boxes") or c.get("text", "").strip())
            ]
        return removed

    def test_pipe_word_boxes_removed(self):
        """Word boxes with text '|' or '||' are removed from cells."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello | world",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                        {"text": "|", "top": 10, "left": 55, "height": 15, "width": 5},
                        {"text": "world", "top": 10, "left": 65, "height": 15, "width": 40},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }
        removed = self._apply_step_4d(zone)
        assert removed == 1
        assert len(zone["cells"][0]["word_boxes"]) == 2
        assert zone["cells"][0]["text"] == "hello world"

    def test_pipe_only_cell_removed(self):
        """A cell containing only '|' word_boxes becomes empty and is removed."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                    ],
                },
                {
                    "cell_id": "Z0_R0_C1",
                    "text": "|",
                    "word_boxes": [
                        {"text": "|", "top": 10, "left": 740, "height": 15, "width": 5},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }
        removed = self._apply_step_4d(zone)
        assert removed == 1
        assert len(zone["cells"]) == 1
        assert zone["cells"][0]["text"] == "hello"

    def test_double_pipe_removed(self):
        """'||' is also treated as a divider artifact."""
        pipe_re = self._pipe_re()
        assert pipe_re.match("||") is not None
        assert pipe_re.match("|") is not None
        assert pipe_re.match("hello") is None
        assert pipe_re.match("|word") is None
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# _detect_header_rows (Fix 3: skip_first_row_header)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestDetectHeaderRowsSkipFlag:
    """Test skip_first_row_header flag."""

    @staticmethod
    def _rows_and_words():
        """Return fresh (rows, words) fixtures shared by both tests.

        Row 0 ends at y=120 and row 1 starts at y=160 (40 px gap), whereas
        rows 1 and 2 are only 5 px apart — the "big gap" after the first row.
        Each word is 20 px tall and sits inside one of the rows.
        """
        rows = [
            {"y_min": 100, "y_max": 120, "index": 0},
            {"y_min": 160, "y_max": 180, "index": 1},
            {"y_min": 185, "y_max": 205, "index": 2},
        ]
        words = [
            {"height": 20, "top": 105, "left": 10, "width": 80},
            {"height": 20, "top": 165, "left": 10, "width": 80},
            {"height": 20, "top": 190, "left": 10, "width": 80},
        ]
        return rows, words

    def test_first_row_detected_without_flag(self):
        """Without flag, first row with big gap → header."""
        rows, words = self._rows_and_words()
        headers = _detect_header_rows(rows, words, 0)
        assert 0 in headers

    def test_first_row_skipped_with_flag(self):
        """With skip flag, first row NOT detected even with big gap."""
        rows, words = self._rows_and_words()
        headers = _detect_header_rows(rows, words, 0, skip_first_row_header=True)
        assert 0 not in headers
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# _text_has_garbled_ipa + fix_ipa_continuation_cell
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestGarbledIpaDetection:
    """Test detection and fixing of garbled IPA in bracket notation."""

    def test_bracket_garbled_no_ipa_chars(self):
        """'[n, nn]' — brackets with no real IPA chars → garbled."""
        assert _text_has_garbled_ipa("[n, nn]") is True

    def test_bracket_garbled_alphanumeric(self):
        """'[1uedtX,1]' — brackets with digits/letters → garbled."""
        assert _text_has_garbled_ipa("[1uedtX,1]") is True

    def test_bracket_valid_ipa_detected(self):
        """'[ɪkwˈɪpmənt]' — brackets with real IPA → detected (has IPA chars).

        Note: _text_has_garbled_ipa detects IPA-like fragments in text.
        Valid IPA also triggers it; callers use a separate check
        (re.search for proper IPA brackets) to skip already-correct IPA.
        """
        assert _text_has_garbled_ipa("[ɪkwˈɪpmənt]") is True

    def test_no_brackets_normal_word(self):
        """'equipment' — normal word → not garbled."""
        assert _text_has_garbled_ipa("equipment") is False

    def test_fix_continuation_united_kingdom(self):
        """IPA continuation for 'the United Kingdom' → IPA without 'the'."""
        fixed = fix_ipa_continuation_cell(
            "[n, nn]", "the United Kingdom", pronunciation="british",
        )
        # Should contain proper IPA, not the garbled text
        assert fixed != "[n, nn]"
        assert "kˈɪŋdəm" in fixed  # Kingdom IPA
        assert "ðə" not in fixed  # "the" must NOT get IPA

    def test_fix_continuation_equipment(self):
        """IPA continuation for 'equipment' → proper IPA."""
        fixed = fix_ipa_continuation_cell(
            "[1uedtX,1]", "equipment (no pl)", pronunciation="british",
        )
        assert fixed != "[1uedtX,1]"
        assert "ɪkwˈɪpmənt" in fixed  # equipment IPA

    def test_fix_continuation_close_down(self):
        """IPA continuation for 'close sth. down' → IPA for both words."""
        fixed = fix_ipa_continuation_cell(
            "[klaoz 'daun]", "close sth. down", pronunciation="british",
        )
        assert fixed != "[klaoz 'daun]"
        assert "klˈəʊs" in fixed  # close IPA
        assert "dˈaʊn" in fixed  # down IPA — must NOT be skipped

    def test_continuation_skips_words_with_inline_ipa(self):
        """'beat [bˈiːt] , beat, beaten' → continuation only for 'beaten'."""
        fixed = fix_ipa_continuation_cell(
            "[bi:tan]", "beat [bˈiːt] , beat, beaten", pronunciation="british",
        )
        # Should only have IPA for "beaten", NOT for "beat" (already inline)
        assert "bˈiːtən" in fixed
        # "bˈiːtən" itself contains "bˈiːt", so a `count("bˈiːt") == 0`
        # alternative can never hold once the assertion above passes; the
        # only remaining way to satisfy the check is exact equality.
        assert fixed == "[bˈiːtən]"

    def test_continuation_bracket_at_end_returns_inline(self):
        """'the Highlands [ˈhaɪləndz]' → return inline IPA, not IPA for 'the'."""
        fixed = fix_ipa_continuation_cell(
            "'hailandz", "the Highlands [ˈhaɪləndz]", pronunciation="british",
        )
        assert fixed == "[ˈhaɪləndz]"
        assert "ðə" not in fixed  # "the" must NOT get IPA

    def test_headword_with_brackets_not_continuation(self):
        """'employee [im'ploi:]' has a headword outside brackets → not garbled.

        _text_has_garbled_ipa returns True (has ':'), but Step 5d should
        skip this cell because text doesn't start with '['.
        """
        text = "employee [im'ploi:]"
        # The garbled check still triggers (has IPA-like ':')
        assert _text_has_garbled_ipa(text) is True
        # But text does NOT start with '[' — Step 5d bracket guard blocks it
        stripped = text.strip()
        assert not (stripped.startswith('[') and stripped.endswith(']'))
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# _detect_heading_rows_by_single_cell
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestDetectHeadingRowsBySingleCell:
|
||
"""Test heading detection for black single-cell rows (e.g. 'Theme')."""
|
||
|
||
def _make_word_box(self, text, left, top, width, height, color="black"):
|
||
return {
|
||
"text": text, "left": left, "top": top,
|
||
"width": width, "height": height, "color_name": color, "conf": 90,
|
||
}
|
||
|
||
def _make_vocab_zone(self):
|
||
"""Build a typical 4-column vocab zone with 8 rows.
|
||
|
||
Columns: column_1 (page_ref), column_2 (EN), column_3 (DE), column_4 (example)
|
||
Row 4 has only 1 cell in column_2 → heading candidate ("Theme").
|
||
"""
|
||
cells = []
|
||
for ri in range(8):
|
||
if ri == 4:
|
||
# Single-cell row: "Theme" in column_2 only
|
||
cells.append({
|
||
"cell_id": f"Z0_R{ri:02d}_C1",
|
||
"zone_index": 0, "row_index": ri, "col_index": 1,
|
||
"col_type": "column_2", "text": "Theme",
|
||
"word_boxes": [self._make_word_box("Theme", 130, 100 + ri * 30, 70, 20)],
|
||
})
|
||
continue
|
||
# Normal vocab row: 3-4 cells
|
||
cells.append({
|
||
"cell_id": f"Z0_R{ri:02d}_C0",
|
||
"zone_index": 0, "row_index": ri, "col_index": 0,
|
||
"col_type": "column_1", "text": f"p.{70 + ri}",
|
||
"word_boxes": [self._make_word_box(f"p.{70+ri}", 10, 100 + ri * 30, 30, 20)],
|
||
})
|
||
cells.append({
|
||
"cell_id": f"Z0_R{ri:02d}_C1",
|
||
"zone_index": 0, "row_index": ri, "col_index": 1,
|
||
"col_type": "column_2", "text": f"word_{ri}",
|
||
"word_boxes": [self._make_word_box(f"word_{ri}", 130, 100 + ri * 30, 80, 20)],
|
||
})
|
||
cells.append({
|
||
"cell_id": f"Z0_R{ri:02d}_C2",
|
||
"zone_index": 0, "row_index": ri, "col_index": 2,
|
||
"col_type": "column_3", "text": f"Wort_{ri}",
|
||
"word_boxes": [self._make_word_box(f"Wort_{ri}", 400, 100 + ri * 30, 80, 20)],
|
||
})
|
||
cells.append({
|
||
"cell_id": f"Z0_R{ri:02d}_C3",
|
||
"zone_index": 0, "row_index": ri, "col_index": 3,
|
||
"col_type": "column_4", "text": f"Example sentence {ri}.",
|
||
"word_boxes": [self._make_word_box(f"Example", 600, 100 + ri * 30, 120, 20)],
|
||
})
|
||
|
||
rows = [
|
||
{"index": ri, "y_min_px": 100 + ri * 30, "y_max_px": 120 + ri * 30, "is_header": False}
|
||
for ri in range(8)
|
||
]
|
||
columns = [
|
||
{"col_index": 0, "col_type": "column_1"},
|
||
{"col_index": 1, "col_type": "column_2"},
|
||
{"col_index": 2, "col_type": "column_3"},
|
||
{"col_index": 3, "col_type": "column_4"},
|
||
]
|
||
return {
|
||
"zone_index": 0, "zone_type": "content",
|
||
"bbox_px": {"x": 0, "y": 0, "w": 800, "h": 1000},
|
||
"cells": cells, "rows": rows, "columns": columns,
|
||
}
|
||
|
||
def test_single_cell_heading_detected(self):
|
||
"""Row with only 1 content cell in column_2 → heading."""
|
||
zone = self._make_vocab_zone()
|
||
zones_data = [zone]
|
||
count = _detect_heading_rows_by_single_cell(zones_data, 800, 1000)
|
||
assert count == 1
|
||
heading_cells = [c for c in zone["cells"] if c["row_index"] == 4]
|
||
assert len(heading_cells) == 1
|
||
assert heading_cells[0]["col_type"] == "heading"
|
||
assert heading_cells[0]["text"] == "Theme"
|
||
assert heading_cells[0]["col_index"] == 1 # Starts at column_2, not 0
|
||
|
||
def test_single_cell_in_last_column_not_heading(self):
|
||
"""Row with only 1 cell in column_4 (last) → NOT heading (continuation)."""
|
||
zone = self._make_vocab_zone()
|
||
# Add a single-cell row in the last column (column_4)
|
||
zone["cells"].append({
|
||
"cell_id": "Z0_R04_C3",
|
||
"zone_index": 0, "row_index": 4, "col_index": 3,
|
||
"col_type": "column_4", "text": "2. Veränderung",
|
||
"word_boxes": [self._make_word_box("2.", 600, 220, 100, 20)],
|
||
})
|
||
# Remove the "Theme" cell from row 4
|
||
zone["cells"] = [c for c in zone["cells"]
|
||
if not (c["row_index"] == 4 and c["col_index"] == 1)]
|
||
zones_data = [zone]
|
||
count = _detect_heading_rows_by_single_cell(zones_data, 800, 1000)
|
||
# Row 4 now only has column_4 → should NOT be heading
|
||
# But original row 4 "Theme" was removed, so no heading at all
|
||
assert count == 0
|
||
|
||
def test_ipa_bracket_text_not_heading(self):
|
||
"""Row with single cell starting with '[' → IPA continuation, not heading."""
|
||
zone = self._make_vocab_zone()
|
||
# Replace "Theme" with IPA continuation
|
||
for c in zone["cells"]:
|
||
if c["row_index"] == 4 and c["col_index"] == 1:
|
||
c["text"] = "[θˈiːm]"
|
||
break
|
||
zones_data = [zone]
|
||
count = _detect_heading_rows_by_single_cell(zones_data, 800, 1000)
|
||
assert count == 0
|
||
|
||
def test_multi_cell_row_not_heading(self):
|
||
"""Normal vocab row with multiple cells → NOT heading."""
|
||
zone = self._make_vocab_zone()
|
||
zones_data = [zone]
|
||
count = _detect_heading_rows_by_single_cell(zones_data, 800, 1000)
|
||
# Only row 4 (Theme) should be heading, other rows have 3-4 cells
|
||
assert count == 1
|
||
# Verify normal rows are NOT marked as heading
|
||
for ri in [0, 1, 2, 3, 5, 6, 7]:
|
||
row_cells = [c for c in zone["cells"] if c["row_index"] == ri]
|
||
for c in row_cells:
|
||
assert c["col_type"] != "heading"
|
||
|
||
def test_color_heading_preserves_correct_col_index(self):
|
||
"""Color heading starting in column_2 → col_index should be 1, not 0."""
|
||
zone = self._make_vocab_zone()
|
||
# Make row 3 a color heading: blue words in column_2 and column_3 only
|
||
# (no column_1 page_ref for this row)
|
||
zone["cells"] = [c for c in zone["cells"] if c["row_index"] != 3]
|
||
zone["cells"].append({
|
||
"cell_id": "Z0_R03_C1", "zone_index": 0, "row_index": 3,
|
||
"col_index": 1, "col_type": "column_2", "text": "Unit 4:",
|
||
"word_boxes": [self._make_word_box("Unit", 130, 190, 50, 26, "blue"),
|
||
self._make_word_box("4:", 185, 190, 20, 26, "blue")],
|
||
})
|
||
zone["cells"].append({
|
||
"cell_id": "Z0_R03_C2", "zone_index": 0, "row_index": 3,
|
||
"col_index": 2, "col_type": "column_3", "text": "Scotland",
|
||
"word_boxes": [self._make_word_box("Scotland", 400, 190, 100, 26, "blue")],
|
||
})
|
||
zones_data = [zone]
|
||
count = _detect_heading_rows_by_color(zones_data, 800, 1000)
|
||
assert count == 1
|
||
heading = [c for c in zone["cells"] if c["row_index"] == 3]
|
||
assert len(heading) == 1
|
||
assert heading[0]["col_type"] == "heading"
|
||
assert heading[0]["col_index"] == 1 # Should start at column_2, not 0
|
||
|
||
def test_last_row_single_cell_not_heading(self):
    """A lone cell in the final row (e.g. page number '212') is NOT a heading."""
    vocab_zone = self._make_vocab_zone()

    # Swap row 7 (the last row) for a single cell sitting in column_2.
    remaining = [
        cell for cell in vocab_zone["cells"] if cell["row_index"] != 7
    ]
    remaining.append({
        "cell_id": "Z0_R07_C1",
        "zone_index": 0,
        "row_index": 7,
        "col_index": 1,
        "col_type": "column_2",
        "text": "two hundred and twelve",
        "word_boxes": [self._make_word_box("two", 130, 310, 30, 20)],
    })
    vocab_zone["cells"] = remaining

    heading_count = _detect_heading_rows_by_single_cell([vocab_zone], 800, 1000)

    # Row 4 "Theme" counts as a heading; the trailing row 7 must not.
    assert heading_count == 1
    heading_rows = {
        cell["row_index"]
        for cell in vocab_zone["cells"]
        if cell.get("col_type") == "heading"
    }
    assert 7 not in heading_rows
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Step 5h: Slash-IPA to bracket conversion
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestSlashIpaConversion:
    """Step 5h rewrites /ocr_ipa/ patterns into [dictionary_ipa] notation."""

    def _run_step_5h(self, text: str) -> str:
        """Apply the Step 5h slash-IPA regex passes to one text string.

        Three passes run in order:
        1. ``headword /ipa/`` -> ``headword [ipa]``, using the dictionary
           IPA when ``_lookup_ipa`` returns one and the OCR text otherwise.
        2. A further ``/ipa/`` directly after a closing ``]`` -> `` [ipa]``.
        3. Only if nothing changed so far: a ``/ipa/`` at the very start
           of the text -> ``[ipa]``.

        Slash content containing whitespace, parentheses or commas is
        left untouched in every pass.
        """
        import re
        from cv_ocr_engines import _lookup_ipa

        headword_slash_re = re.compile(
            r'(\b[a-zA-Z]+[²³¹]?)\s*'
            r"(/[^/]{2,}/)"
        )
        leading_slash_re = re.compile(r'^/([^/]{2,})/')
        reject_re = re.compile(r'[\s(),]')
        after_bracket_re = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')

        def convert_headword(match):
            headword = match.group(1)
            raw_ipa = match.group(2).strip("/").strip()
            if reject_re.search(raw_ipa):
                return match.group(0)
            # Superscript markers and digits are dropped for the lookup only;
            # the emitted headword keeps them.
            lookup_key = re.sub(r'[²³¹\d]', '', headword).strip()
            dict_ipa = _lookup_ipa(lookup_key, "british") if lookup_key else None
            if dict_ipa:
                return f"{headword} [{dict_ipa}]"
            ocr_ipa = raw_ipa.lstrip("'").strip()
            return f"{headword} [{ocr_ipa}]" if ocr_ipa else match.group(0)

        def convert_trailing(match):
            ocr_ipa = match.group(1).strip("/").strip().lstrip("'").strip()
            if not ocr_ipa or reject_re.search(ocr_ipa):
                return match.group(0)
            return f" [{ocr_ipa}]"

        converted = headword_slash_re.sub(convert_headword, text)
        # Second pass: trailing /ipa/ after [ipa]
        converted = after_bracket_re.sub(convert_trailing, converted)
        if converted != text:
            return converted

        # Nothing matched so far: try a standalone /ipa/ at the start.
        leading = leading_slash_re.match(text)
        if leading is None:
            return converted
        ocr_ipa = leading.group(1).strip()
        if reject_re.search(ocr_ipa):
            return converted
        ocr_ipa = ocr_ipa.lstrip("'").strip()
        if not ocr_ipa:
            return converted
        return f"[{ocr_ipa}]{text[leading.end():]}"

    def test_tiger_dict_lookup(self):
        """tiger /'taiga/ → tiger [tˈaɪgə] (IPA taken from the dictionary)."""
        converted = self._run_step_5h("tiger /'taiga/ Nomen Tiger")
        assert "[tˈaɪgə]" in converted
        assert "/'taiga/" not in converted
        assert converted.startswith("tiger")

    def test_tight_no_space(self):
        """tight²/tait/ → tight² [tˈaɪt] (slash follows the headword directly)."""
        converted = self._run_step_5h("tight²/tait/ Adv fest")
        assert "[tˈaɪt]" in converted
        assert "/tait/" not in converted

    def test_unknown_word_falls_back_to_ocr(self):
        """tinned/und/ → tinned [und] (no dictionary entry, OCR IPA is kept)."""
        converted = self._run_step_5h("tinned/und/ Adj Dosen-")
        assert "[und]" in converted
        assert "/und/" not in converted

    def test_sb_sth_not_matched(self):
        """sb/sth must NOT be read as IPA (slash content has space/parens)."""
        original = "(tie sb/sth up) jdn/etwas anbinden"
        converted = self._run_step_5h(original)
        # The slash content "sth up) jdn" has spaces and parens → rejected,
        # so the text comes back unchanged.
        assert converted == original

    def test_double_ipa_both_converted(self):
        """times/taimz/ /tamz/ → times [tˈaɪmz] [tamz] (both get converted)."""
        converted = self._run_step_5h("times/taimz/ /tamz/ Präp")
        assert "[tˈaɪmz]" in converted
        assert "[tamz]" in converted
        assert "/taimz/" not in converted
        assert "/tamz/" not in converted

    def test_standalone_slash_ipa_at_start(self):
        """/tam/ Nomen → [tam] Nomen (cell has no headword)."""
        converted = self._run_step_5h("/tam/ Nomen 1 Zeit")
        assert converted.startswith("[tam]")
        assert "/tam/" not in converted

    def test_no_slashes_unchanged(self):
        """Text that contains no slashes is returned as-is."""
        original = "hello world"
        assert self._run_step_5h(original) == original

    def test_tile_dict_lookup(self):
        """tile /tail/ → tile [tˈaɪl]."""
        converted = self._run_step_5h("tile /tail/ Nomen Dachziegel")
        assert "[tˈaɪl]" in converted
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Color detection: red false-positive suppression
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestRedFalsePositiveSuppression:
|
||
"""Red requires median_sat >= 80 to avoid scanner artifact false positives."""
|
||
|
||
def test_low_saturation_red_classified_as_black(self):
    """Black text with slight warm scanner tint (sat ~60) → black, not red."""
    # numpy (np) and cv2 come from the module-level imports; only the
    # detector itself is imported locally.
    from cv_color_detect import detect_word_colors

    # Uniform image, 40 rows x 200 columns, filled with dark pixels that
    # carry a slight warm tint.
    # HSV: hue=5 (red range), sat=60 (above 55 threshold but below 80), val=40
    img_hsv = np.full((40, 200, 3), [5, 60, 40], dtype=np.uint8)
    img_bgr = cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR)

    # One word box lying fully inside the image; the colour result is read
    # back from this same dict after the call.
    wb = [{"left": 10, "top": 5, "width": 50, "height": 20, "text": "test"}]
    detect_word_colors(img_bgr, wb)
    assert wb[0]["color_name"] == "black", \
        f"Expected black, got {wb[0]['color_name']} (scanner artifact false positive)"
|
||
|
||
def test_high_saturation_red_classified_as_red(self):
    """Genuinely red text (sat=180) → red."""
    # numpy (np) and cv2 come from the module-level imports; only the
    # detector itself is imported locally.
    from cv_color_detect import detect_word_colors

    # White background with a red text region.
    # Background: white (H=0, S=0, V=255), 40 rows x 200 columns.
    img_hsv = np.full((40, 200, 3), [0, 0, 255], dtype=np.uint8)
    # Text area: red (H=5, S=180, V=200), rows 8-17 / columns 15-54, which
    # lies inside the word box declared below.
    img_hsv[8:18, 15:55] = [5, 180, 200]
    img_bgr = cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR)

    # The colour result is read back from this same dict after the call.
    wb = [{"left": 10, "top": 5, "width": 50, "height": 20, "text": "red"}]
    detect_word_colors(img_bgr, wb)
    assert wb[0]["color_name"] == "red", \
        f"Expected red, got {wb[0]['color_name']}"
|