Files
breakpilot-lehrer/klausur-service/backend/tests/test_grid_editor_api.py
Benjamin Admin d889a6959e Fix red false-positive in color detection for scanned black text
Scanner artifacts on black text produce slight warm tint (hue ~0, sat ~60)
that was misclassified as red. Now requires median_sat >= 80 specifically
for red classification, since genuine red text always has high saturation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 17:18:44 +01:00

957 lines
42 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Tests for grid_editor_api zone merging, heading detection, and ghost filtering.
Covers:
- _merge_content_zones_across_boxes: zone merging logic
- _detect_heading_rows_by_color: heading detection by color + height
- _filter_border_ghosts: single-char ghost detection
- _detect_header_rows: skip_first_row_header flag
"""
import sys
sys.path.insert(0, '/app')
import cv2
import numpy as np
import pytest
from cv_vocab_types import PageZone, DetectedBox
from grid_editor_api import (
_merge_content_zones_across_boxes,
_filter_border_ghosts,
_detect_header_rows,
_detect_heading_rows_by_color,
_detect_heading_rows_by_single_cell,
)
from cv_ocr_engines import _text_has_garbled_ipa, fix_ipa_continuation_cell
# ---------------------------------------------------------------------------
# _merge_content_zones_across_boxes
# ---------------------------------------------------------------------------
class TestMergeContentZonesAcrossBoxes:
    """Exercise ``_merge_content_zones_across_boxes`` on hand-built zone lists."""

    @staticmethod
    def _content(index, y, height):
        """Build a content zone spanning x=0..500 at the given vertical slot."""
        return PageZone(index=index, zone_type="content", y=y, height=height,
                        x=0, width=500)

    @staticmethod
    def _box(index, y, height, x=50, width=400, confidence=0.9):
        """Build a box zone whose ``DetectedBox`` shares the zone's geometry."""
        detected = DetectedBox(x=x, y=y, width=width, height=height,
                               confidence=confidence)
        return PageZone(index=index, zone_type="box", y=y, height=height,
                        x=x, width=width, box=detected)

    def test_no_merge_when_less_than_3_zones(self):
        """Fewer than 3 zones → nothing to merge, both zones survive."""
        zones = [self._content(0, 0, 100), self._box(1, 100, 50)]

        result = _merge_content_zones_across_boxes(zones, 0, 500)

        assert len(result) == 2
        first, second = result[0], result[1]
        assert first.zone_type == "content"
        assert second.zone_type == "box"

    def test_merge_content_box_content(self):
        """[content, box, content] → one content zone carrying the box as overlay."""
        zones = [
            self._content(0, 0, 100),
            self._box(1, 100, 50),
            self._content(2, 150, 200),
        ]

        result = _merge_content_zones_across_boxes(zones, 0, 500)

        assert len(result) == 1
        merged = result[0]
        assert merged.zone_type == "content"
        assert merged.y == 0
        assert merged.height == 350  # spans 0 to 350
        overlays = merged.image_overlays
        assert len(overlays) == 1
        assert overlays[0]["y"] == 100
        assert overlays[0]["height"] == 50

    def test_box_at_start_not_merged(self):
        """A leading box (no content above it) is left as its own zone."""
        zones = [
            self._box(0, 0, 50),
            self._content(1, 50, 100),
            self._box(2, 150, 50),
            self._content(3, 200, 200),
        ]

        result = _merge_content_zones_across_boxes(zones, 0, 500)

        # Leading box stays; the following content+box+content collapses.
        assert len(result) == 2
        leading, merged = result[0], result[1]
        assert leading.zone_type == "box"
        assert merged.zone_type == "content"
        assert len(merged.image_overlays) == 1

    def test_consecutive_boxes_not_merged(self):
        """[content, box, box, content] → untouched (consecutive boxes rare in practice)."""
        zones = [
            self._content(0, 0, 100),
            self._box(1, 100, 50),
            self._box(2, 150, 30, x=60, width=380, confidence=0.8),
            self._content(3, 180, 200),
        ]

        result = _merge_content_zones_across_boxes(zones, 0, 500)

        # Only the exact [content, box, content] pattern is merged; two boxes
        # in a row break that pattern.
        assert len(result) == 4

    def test_zone_reindexing(self):
        """After a merge the surviving zone is numbered from 0 again."""
        zones = [
            self._content(0, 0, 100),
            self._box(1, 100, 50),
            self._content(2, 150, 200),
        ]

        result = _merge_content_zones_across_boxes(zones, 0, 500)

        assert result[0].index == 0

    def test_no_boxes_passthrough(self):
        """A list of content-only zones keeps its length."""
        zones = [self._content(0, 0, 100), self._content(1, 100, 100)]

        result = _merge_content_zones_across_boxes(zones, 0, 500)

        assert len(result) == 2

    def test_typical_vocab_page_pattern(self):
        """Typical page: [box(VOCABULARY), content, box(image), content].

        The header box stays separate; content+box+content merges.
        """
        zones = [
            self._box(0, 10, 40, confidence=0.95),
            self._content(1, 60, 50),
            self._box(2, 120, 85, confidence=0.8),
            self._content(3, 210, 500),
        ]

        result = _merge_content_zones_across_boxes(zones, 0, 500)

        assert len(result) == 2
        header_box, merged = result[0], result[1]
        assert header_box.zone_type == "box"  # VOCABULARY header box stays
        assert merged.zone_type == "content"  # merged content zone
        assert merged.y == 60
        assert merged.height == 710 - 60  # spans 60 to 710
        assert len(merged.image_overlays) == 1
        assert merged.image_overlays[0]["y"] == 120
        # Indices are renumbered consecutively.
        assert header_box.index == 0
        assert merged.index == 1
# ---------------------------------------------------------------------------
# _detect_heading_rows_by_color
# ---------------------------------------------------------------------------
class TestDetectHeadingRowsByColor:
    """Heading rows are expected to be uniformly coloured and taller than normal rows."""

    def _make_word_box(self, text, left, top, width, height, color="black"):
        """Return a word-box dict with the given geometry/colour and conf=90."""
        return dict(
            text=text,
            left=left,
            top=top,
            width=width,
            height=height,
            color_name=color,
            conf=90,
        )

    def _make_zone(self, cells, rows, columns, zone_index=0,
                   bbox_x=0, bbox_y=0, bbox_w=800, bbox_h=1000):
        """Wrap cells/rows/columns into a content-zone dict with a pixel bbox."""
        bbox = {"x": bbox_x, "y": bbox_y, "w": bbox_w, "h": bbox_h}
        return {
            "zone_index": zone_index,
            "zone_type": "content",
            "bbox_px": bbox,
            "cells": cells,
            "rows": rows,
            "columns": columns,
        }

    def _cell(self, row, col, text, left, top, width, height,
              color="black", col_type=None):
        """Build a zone-0 cell holding one word box whose text equals the cell text."""
        return {
            "cell_id": f"Z0_R{row:02d}_C{col}",
            "zone_index": 0,
            "row_index": row,
            "col_index": col,
            "col_type": col_type or f"column_{col + 1}",
            "text": text,
            "word_boxes": [
                self._make_word_box(text, left, top, width, height, color),
            ],
        }

    @staticmethod
    def _row(index, y_min, y_max, is_header=False):
        """Build a row descriptor dict."""
        return {"index": index, "y_min_px": y_min, "y_max_px": y_max,
                "is_header": is_header}

    @staticmethod
    def _columns(count):
        """Build ``count`` column descriptors labelled column_1, column_2, ..."""
        return [{"index": i, "label": f"column_{i + 1}"} for i in range(count)]

    def _plain_second_row(self):
        """The black 'foo' / 'bar' row (row 1) shared by the two-row fixtures."""
        return [
            self._cell(1, 0, "foo", 10, 130, 80, 20),
            self._cell(1, 1, "bar", 300, 130, 80, 20),
        ]

    def test_blue_heading_detected(self):
        """Row with all blue words + taller height → heading."""
        heading_ri = 2
        cells = []
        for ri in range(5):
            top = 100 + ri * 30
            is_heading = ri == heading_ri
            # Normal rows are black with height 20; the heading row is blue
            # with height 25, i.e. above 1.2 * 20 = 24.
            color = "blue" if is_heading else "black"
            height = 25 if is_heading else 20
            cells.append(
                self._cell(ri, 0, f"word_{ri}", 10, top, 80, height, color))
            cells.append(
                self._cell(ri, 1, f"translation_{ri}", 300, top, 100, height, color))
        rows = [self._row(ri, 100 + ri * 30, 120 + ri * 30) for ri in range(5)]
        zones_data = [self._make_zone(cells, rows, self._columns(2))]

        count = _detect_heading_rows_by_color(zones_data, 800, 1000)

        assert count == 1
        # Row 2 must now be flagged as a header.
        assert rows[2]["is_header"] is True
        # Its cells must have been collapsed into one heading cell.
        heading_cells = [
            c for c in zones_data[0]["cells"] if c["row_index"] == heading_ri
        ]
        assert len(heading_cells) == 1
        heading = heading_cells[0]
        assert heading["col_type"] == "heading"
        assert "word_2" in heading["text"]
        assert "translation_2" in heading["text"]

    def test_black_row_not_heading(self):
        """Row with black words → not a heading, even if tall."""
        cells = [
            self._cell(0, 0, "hello", 10, 100, 80, 25, "black"),
            self._cell(0, 1, "world", 300, 100, 80, 25, "black"),
            *self._plain_second_row(),
        ]
        rows = [self._row(0, 100, 125), self._row(1, 130, 150)]
        zones_data = [self._make_zone(cells, rows, self._columns(2))]

        count = _detect_heading_rows_by_color(zones_data, 800, 1000)

        assert count == 0

    def test_mixed_color_row_not_heading(self):
        """Row with some blue and some black words → not a heading."""
        cells = [
            self._cell(0, 0, "Unit", 10, 100, 60, 25, "blue"),
            self._cell(0, 1, "normal", 300, 100, 80, 25, "black"),
            *self._plain_second_row(),
        ]
        rows = [self._row(0, 100, 125), self._row(1, 130, 150)]
        zones_data = [self._make_zone(cells, rows, self._columns(2))]

        count = _detect_heading_rows_by_color(zones_data, 800, 1000)

        assert count == 0

    def test_colored_but_not_tall_not_heading(self):
        """Row with all blue words but normal height → not a heading."""
        cells = [
            self._cell(0, 0, "Unit", 10, 100, 60, 20, "blue"),
            self._cell(0, 1, "four", 300, 100, 60, 20, "blue"),
            *self._plain_second_row(),
        ]
        rows = [self._row(0, 100, 120), self._row(1, 130, 150)]
        zones_data = [self._make_zone(cells, rows, self._columns(2))]

        count = _detect_heading_rows_by_color(zones_data, 800, 1000)

        assert count == 0

    def test_single_column_zone_skipped(self):
        """Zones with < 2 columns are skipped."""
        cells = [self._cell(0, 0, "Unit", 10, 100, 60, 25, "blue")]
        rows = [self._row(0, 100, 125)]
        zones_data = [self._make_zone(cells, rows, self._columns(1))]

        count = _detect_heading_rows_by_color(zones_data, 800, 1000)

        assert count == 0

    def test_already_header_skipped(self):
        """Rows already marked is_header are not re-detected."""
        cells = [
            self._cell(0, 0, "Header", 10, 100, 60, 25, "blue",
                       col_type="spanning_header"),
            *self._plain_second_row(),
        ]
        rows = [
            self._row(0, 100, 125, is_header=True),
            self._row(1, 130, 150),
        ]
        zones_data = [self._make_zone(cells, rows, self._columns(2))]

        count = _detect_heading_rows_by_color(zones_data, 800, 1000)

        assert count == 0
# ---------------------------------------------------------------------------
# _filter_border_ghosts (Fix 2: single-char only)
# ---------------------------------------------------------------------------
class TestFilterBorderGhosts:
    """Ghost filtering must only drop single-character words on bordered boxes."""

    @staticmethod
    def _word(text, left, top, width, height):
        """Build a minimal OCR word dict."""
        return {"text": text, "left": left, "top": top,
                "width": width, "height": height}

    def test_single_char_ghost_removed(self):
        """Single '|' on a box border → filtered as ghost."""
        box = DetectedBox(x=100, y=100, width=200, height=150,
                          confidence=0.9, border_thickness=3)
        words = [
            self._word("|", 98, 200, 5, 20),
            self._word("hello", 150, 150, 80, 20),
        ]

        kept, removed = _filter_border_ghosts(words, [box])

        assert removed == 1
        assert len(kept) == 1
        assert kept[0]["text"] == "hello"

    def test_multi_char_ghost_kept(self):
        """Multi-char '(=' on a bordered box → NOT filtered (real content)."""
        box = DetectedBox(x=648, y=129, width=338, height=125,
                          confidence=0.7, border_thickness=3)
        words = [
            self._word("(=", 644, 294, 16, 17),
            self._word("I", 665, 294, 9, 18),
        ]

        kept, removed = _filter_border_ghosts(words, [box])

        assert removed == 0
        assert len(kept) == 2

    def test_borderless_box_no_ghost_filter(self):
        """Borderless box (border_thickness=0) → no ghost filtering at all."""
        box = DetectedBox(x=648, y=129, width=338, height=125,
                          confidence=0.7, border_thickness=0)
        words = [
            self._word("I", 643, 272, 6, 19),   # near box edge
            self._word("|", 647, 200, 3, 10),   # even a pipe on the edge
        ]

        kept, removed = _filter_border_ghosts(words, [box])

        assert removed == 0  # nothing filtered — borderless box
        assert len(kept) == 2

    def test_single_paren_on_border_removed(self):
        """Single ')' on border → filtered."""
        box = DetectedBox(x=100, y=100, width=200, height=150,
                          confidence=0.9, border_thickness=2)
        words = [self._word(")", 299, 200, 4, 7)]

        kept, removed = _filter_border_ghosts(words, [box])

        assert removed == 1
        assert len(kept) == 0
# ---------------------------------------------------------------------------
# Step 4d: Pipe-character divider filter
# ---------------------------------------------------------------------------
class TestPipeDividerFilter:
    """Step 4d removes '|' word_boxes that are OCR artifacts from column dividers.

    The step is simulated here rather than imported. The simulation lives in
    one helper (``_apply_step_4d``) so every test exercises the same logic
    instead of carrying its own copy of the regex and filter loop.
    """

    # A word box is a divider artifact when it consists solely of pipes.
    _PIPE_PATTERN = r"^\|+$"

    @classmethod
    def _apply_step_4d(cls, zone):
        """Strip pipe-only word boxes from ``zone`` in place.

        For each cell that loses at least one word box, ``word_boxes`` is
        replaced by the survivors and ``text`` is rebuilt from them in
        (top, left) order. If anything was removed, cells left with neither
        word boxes nor non-blank text are dropped from ``zone["cells"]``.

        Returns:
            The number of word boxes removed.
        """
        import re  # function-scope import keeps this class self-contained

        pipe_re = re.compile(cls._PIPE_PATTERN)
        removed = 0
        for cell in zone["cells"]:
            boxes = cell.get("word_boxes") or []
            kept = [
                wb for wb in boxes
                if not pipe_re.match((wb.get("text") or "").strip())
            ]
            if len(kept) == len(boxes):
                continue  # nothing filtered: leave the cell untouched
            removed += len(boxes) - len(kept)
            cell["word_boxes"] = kept
            reading_order = sorted(
                kept, key=lambda w: (w.get("top", 0), w.get("left", 0)))
            # ``or ""`` tolerates an explicit ``"text": None`` entry.
            words = ((wb.get("text") or "").strip() for wb in reading_order)
            cell["text"] = " ".join(word for word in words if word)
        if removed:
            zone["cells"] = [
                c for c in zone["cells"]
                if c.get("word_boxes") or (c.get("text") or "").strip()
            ]
        return removed

    def test_pipe_word_boxes_removed(self):
        """Word boxes with text '|' or '||' are removed from cells."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello | world",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                        {"text": "|", "top": 10, "left": 55, "height": 15, "width": 5},
                        {"text": "world", "top": 10, "left": 65, "height": 15, "width": 40},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }

        self._apply_step_4d(zone)

        assert len(zone["cells"][0]["word_boxes"]) == 2
        assert zone["cells"][0]["text"] == "hello world"

    def test_pipe_only_cell_removed(self):
        """A cell containing only '|' word_boxes becomes empty and is removed."""
        zone = {
            "zone_index": 0,
            "cells": [
                {
                    "cell_id": "Z0_R0_C0",
                    "text": "hello",
                    "word_boxes": [
                        {"text": "hello", "top": 10, "left": 10, "height": 15, "width": 40},
                    ],
                },
                {
                    "cell_id": "Z0_R0_C1",
                    "text": "|",
                    "word_boxes": [
                        {"text": "|", "top": 10, "left": 740, "height": 15, "width": 5},
                    ],
                },
            ],
            "rows": [{"index": 0}],
        }

        removed = self._apply_step_4d(zone)

        assert removed == 1
        assert len(zone["cells"]) == 1
        assert zone["cells"][0]["text"] == "hello"

    def test_double_pipe_removed(self):
        """'||' is also treated as a divider artifact."""
        import re

        pipe_re = re.compile(self._PIPE_PATTERN)
        assert pipe_re.match("||") is not None
        assert pipe_re.match("|") is not None
        assert pipe_re.match("hello") is None
        assert pipe_re.match("|word") is None
# ---------------------------------------------------------------------------
# _detect_header_rows (Fix 3: skip_first_row_header)
# ---------------------------------------------------------------------------
class TestDetectHeaderRowsSkipFlag:
    """Check how ``skip_first_row_header`` affects ``_detect_header_rows``."""

    @staticmethod
    def _rows_and_words():
        """Three rows with a large gap after the first, plus one word per row."""
        spans = [(100, 120), (160, 180), (185, 205)]
        rows = [
            {"y_min": y_min, "y_max": y_max, "index": idx}
            for idx, (y_min, y_max) in enumerate(spans)
        ]
        # Each word sits 5px below the top of its row.
        words = [
            {"height": 20, "top": y_min + 5, "left": 10, "width": 80}
            for y_min, _ in spans
        ]
        return rows, words

    def test_first_row_detected_without_flag(self):
        """Without flag, first row with big gap → header."""
        rows, words = self._rows_and_words()

        headers = _detect_header_rows(rows, words, 0)

        assert 0 in headers

    def test_first_row_skipped_with_flag(self):
        """With skip flag, first row NOT detected even with big gap."""
        rows, words = self._rows_and_words()

        headers = _detect_header_rows(rows, words, 0, skip_first_row_header=True)

        assert 0 not in headers
# ---------------------------------------------------------------------------
# _text_has_garbled_ipa + fix_ipa_continuation_cell
# ---------------------------------------------------------------------------
class TestGarbledIpaDetection:
    """Detection and repair of garbled IPA written in bracket notation."""

    def test_bracket_garbled_no_ipa_chars(self):
        """'[n, nn]' — brackets with no real IPA chars → garbled."""
        verdict = _text_has_garbled_ipa("[n, nn]")
        assert verdict is True

    def test_bracket_garbled_alphanumeric(self):
        """'[1uedtX,1]' — brackets with digits/letters → garbled."""
        verdict = _text_has_garbled_ipa("[1uedtX,1]")
        assert verdict is True

    def test_bracket_valid_ipa_detected(self):
        """'[ɪkwˈɪpmənt]' — brackets with real IPA → still reported as True.

        Note: _text_has_garbled_ipa detects IPA-like fragments in text.
        Valid IPA also triggers it; callers use a separate check
        (re.search for proper IPA brackets) to skip already-correct IPA.
        """
        verdict = _text_has_garbled_ipa("[ɪkwˈɪpmənt]")
        assert verdict is True

    def test_no_brackets_normal_word(self):
        """'equipment' — normal word → not garbled."""
        verdict = _text_has_garbled_ipa("equipment")
        assert verdict is False

    def test_fix_continuation_united_kingdom(self):
        """IPA continuation for 'the United Kingdom' → IPA without 'the'."""
        garbled = "[n, nn]"
        fixed = fix_ipa_continuation_cell(
            garbled, "the United Kingdom", pronunciation="british",
        )
        # The garbled text must be replaced by proper IPA.
        assert fixed != garbled
        assert "kˈɪŋdəm" in fixed  # Kingdom IPA
        assert "ðə" not in fixed  # "the" must NOT get IPA

    def test_fix_continuation_equipment(self):
        """IPA continuation for 'equipment' → proper IPA."""
        garbled = "[1uedtX,1]"
        fixed = fix_ipa_continuation_cell(
            garbled, "equipment (no pl)", pronunciation="british",
        )
        assert fixed != garbled
        assert "ɪkwˈɪpmənt" in fixed  # equipment IPA

    def test_fix_continuation_close_down(self):
        """IPA continuation for 'close sth. down' → IPA for both words."""
        garbled = "[klaoz 'daun]"
        fixed = fix_ipa_continuation_cell(
            garbled, "close sth. down", pronunciation="british",
        )
        assert fixed != garbled
        assert "klˈəʊs" in fixed  # close IPA
        assert "dˈaʊn" in fixed  # down IPA — must NOT be skipped

    def test_continuation_skips_words_with_inline_ipa(self):
        """'beat [bˈiːt] , beat, beaten' → continuation only for 'beaten'."""
        fixed = fix_ipa_continuation_cell(
            "[bi:tan]", "beat [bˈiːt] , beat, beaten", pronunciation="british",
        )
        # Only "beaten" should receive IPA; "beat" already has it inline.
        assert "bˈiːtən" in fixed
        assert "bˈiːt" not in fixed or fixed == "[bˈiːtən]"

    def test_continuation_bracket_at_end_returns_inline(self):
        """'the Highlands [ˈhaɪləndz]' → return inline IPA, not IPA for 'the'."""
        fixed = fix_ipa_continuation_cell(
            "'hailandz", "the Highlands [ˈhaɪləndz]", pronunciation="british",
        )
        assert fixed == "[ˈhaɪləndz]"
        assert "ðə" not in fixed  # "the" must NOT get IPA

    def test_headword_with_brackets_not_continuation(self):
        """'employee [im'ploi:]' has a headword outside brackets → not garbled.

        _text_has_garbled_ipa returns True (has ':'), but Step 5d should
        skip this cell because text doesn't start with '['.
        """
        text = "employee [im'ploi:]"
        # The garbled check still fires (IPA-like ':').
        assert _text_has_garbled_ipa("employee [im'ploi:]") is True
        # The text is not fully bracket-wrapped, so the Step 5d bracket
        # guard would not treat it as a continuation cell.
        stripped = text.strip()
        assert not stripped.startswith('[') or not stripped.endswith(']')
# ---------------------------------------------------------------------------
# _detect_heading_rows_by_single_cell
# ---------------------------------------------------------------------------
class TestDetectHeadingRowsBySingleCell:
    """Heading detection for black single-cell rows (e.g. 'Theme')."""

    def _make_word_box(self, text, left, top, width, height, color="black"):
        """Return a word-box dict with the given geometry/colour and conf=90."""
        return dict(
            text=text, left=left, top=top,
            width=width, height=height, color_name=color, conf=90,
        )

    def _cell(self, row, col, text, word_boxes):
        """Build a zone-0 cell at (row, col) with col_type ``column_<col+1>``."""
        return {
            "cell_id": f"Z0_R{row:02d}_C{col}",
            "zone_index": 0, "row_index": row, "col_index": col,
            "col_type": f"column_{col + 1}", "text": text,
            "word_boxes": word_boxes,
        }

    def _make_vocab_zone(self):
        """Build a typical 4-column vocab zone with 8 rows.

        Columns: column_1 (page_ref), column_2 (EN), column_3 (DE),
        column_4 (example). Row 4 has a single cell in column_2 → the
        heading candidate ("Theme").
        """
        cells = []
        for ri in range(8):
            top = 100 + ri * 30
            if ri == 4:
                # Single-cell row: "Theme" in column_2 only.
                theme_box = self._make_word_box("Theme", 130, top, 70, 20)
                cells.append(self._cell(ri, 1, "Theme", [theme_box]))
                continue
            # Normal vocab row: (col, cell text, word text, left, width).
            layout = [
                (0, f"p.{70 + ri}", f"p.{70+ri}", 10, 30),
                (1, f"word_{ri}", f"word_{ri}", 130, 80),
                (2, f"Wort_{ri}", f"Wort_{ri}", 400, 80),
                (3, f"Example sentence {ri}.", "Example", 600, 120),
            ]
            for col, cell_text, word_text, left, width in layout:
                box = self._make_word_box(word_text, left, top, width, 20)
                cells.append(self._cell(ri, col, cell_text, [box]))
        rows = [
            {"index": ri, "y_min_px": 100 + ri * 30,
             "y_max_px": 120 + ri * 30, "is_header": False}
            for ri in range(8)
        ]
        columns = [
            {"col_index": ci, "col_type": f"column_{ci + 1}"}
            for ci in range(4)
        ]
        return {
            "zone_index": 0, "zone_type": "content",
            "bbox_px": {"x": 0, "y": 0, "w": 800, "h": 1000},
            "cells": cells, "rows": rows, "columns": columns,
        }

    def test_single_cell_heading_detected(self):
        """Row with only 1 content cell in column_2 → heading."""
        zone = self._make_vocab_zone()

        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)

        assert count == 1
        heading_cells = [c for c in zone["cells"] if c["row_index"] == 4]
        assert len(heading_cells) == 1
        heading = heading_cells[0]
        assert heading["col_type"] == "heading"
        assert heading["text"] == "Theme"
        assert heading["col_index"] == 1  # Starts at column_2, not 0

    def test_single_cell_in_last_column_not_heading(self):
        """Row with only 1 cell in column_4 (last) → NOT heading (continuation)."""
        zone = self._make_vocab_zone()
        # Swap row 4's "Theme" cell (column_2) for a lone cell in column_4.
        without_theme = [
            c for c in zone["cells"]
            if c["row_index"] != 4 or c["col_index"] != 1
        ]
        last_col_cell = self._cell(
            4, 3, "2. Veränderung",
            [self._make_word_box("2.", 600, 220, 100, 20)],
        )
        zone["cells"] = without_theme + [last_col_cell]

        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)

        # Row 4 now only has column_4 → should NOT be a heading, and with
        # "Theme" gone there is no other candidate.
        assert count == 0

    def test_ipa_bracket_text_not_heading(self):
        """Row whose single cell holds an IPA bracket fragment → not a heading."""
        zone = self._make_vocab_zone()
        # Replace "Theme" with an IPA continuation fragment.
        theme_cell = next(
            c for c in zone["cells"]
            if c["row_index"] == 4 and c["col_index"] == 1
        )
        theme_cell["text"] = "ˈiːm]"

        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)

        assert count == 0

    def test_multi_cell_row_not_heading(self):
        """Normal vocab row with multiple cells → NOT heading."""
        zone = self._make_vocab_zone()

        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)

        # Only row 4 (Theme) qualifies; every other row has several cells.
        assert count == 1
        for ri in (0, 1, 2, 3, 5, 6, 7):
            for cell in zone["cells"]:
                if cell["row_index"] == ri:
                    assert cell["col_type"] != "heading"

    def test_color_heading_preserves_correct_col_index(self):
        """Color heading starting in column_2 → col_index should be 1, not 0."""
        zone = self._make_vocab_zone()
        # Turn row 3 into a colour heading: blue words in column_2 and
        # column_3 only (no column_1 page_ref for this row).
        other_rows = [c for c in zone["cells"] if c["row_index"] != 3]
        unit_cell = self._cell(3, 1, "Unit 4:", [
            self._make_word_box("Unit", 130, 190, 50, 26, "blue"),
            self._make_word_box("4:", 185, 190, 20, 26, "blue"),
        ])
        scotland_cell = self._cell(3, 2, "Scotland", [
            self._make_word_box("Scotland", 400, 190, 100, 26, "blue"),
        ])
        zone["cells"] = other_rows + [unit_cell, scotland_cell]

        count = _detect_heading_rows_by_color([zone], 800, 1000)

        assert count == 1
        heading = [c for c in zone["cells"] if c["row_index"] == 3]
        assert len(heading) == 1
        assert heading[0]["col_type"] == "heading"
        assert heading[0]["col_index"] == 1  # Should start at column_2, not 0

    def test_last_row_single_cell_not_heading(self):
        """Single-cell in last row (e.g. page number '212') → NOT heading."""
        zone = self._make_vocab_zone()
        # Reduce row 7 (the last) to a single cell in column_2.
        other_rows = [c for c in zone["cells"] if c["row_index"] != 7]
        page_number_cell = self._cell(
            7, 1, "two hundred and twelve",
            [self._make_word_box("two", 130, 310, 30, 20)],
        )
        zone["cells"] = other_rows + [page_number_cell]

        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)

        # Row 4 "Theme" is the only heading; row 7 (last) must not be one.
        assert count == 1
        heading_rows = [
            c["row_index"] for c in zone["cells"]
            if c.get("col_type") == "heading"
        ]
        assert 7 not in heading_rows
# ---------------------------------------------------------------------------
# Step 5h: Slash-IPA to bracket conversion
# ---------------------------------------------------------------------------
class TestSlashIpaConversion:
    """Step 5h converts /ocr_ipa/ patterns to [dictionary_ipa] notation."""

    def _run_step_5h(self, text: str) -> str:
        """Apply a local copy of the Step 5h regex logic to one text string.

        Three stages run in order: ``headword /ipa/`` pairs, ``/ipa/``
        directly following a ``]``, and — only if nothing changed so far —
        a standalone ``/ipa/`` at the very start of the text.
        """
        import re
        from cv_ocr_engines import _lookup_ipa

        headword_slash_re = re.compile(
            r'(\b[a-zA-Z]+[²³¹]?)\s*'
            r"(/[^/]{2,}/)"
        )
        after_bracket_re = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
        leading_slash_re = re.compile(r'^/([^/]{2,})/')
        # Whitespace, parentheses or commas mean the span is not IPA.
        reject_re = re.compile(r'[\s(),]')

        def convert_headword(match):
            headword, ocr_ipa = match.group(1), match.group(2)
            raw_inner = ocr_ipa.strip("/").strip()
            if reject_re.search(raw_inner):
                return match.group(0)
            # Look the headword up without superscript/digit markers.
            bare_headword = re.sub(r'[²³¹\d]', '', headword).strip()
            dictionary_ipa = None
            if bare_headword:
                dictionary_ipa = _lookup_ipa(bare_headword, "british")
            if dictionary_ipa:
                return f"{headword} [{dictionary_ipa}]"
            # No dictionary entry: keep the OCR reading, minus leading quotes.
            ocr_fallback = raw_inner.lstrip("'").strip()
            if not ocr_fallback:
                return match.group(0)
            return f"{headword} [{ocr_fallback}]"

        def convert_trailing(match):
            inner = match.group(1).strip("/").strip().lstrip("'").strip()
            if inner and not reject_re.search(inner):
                return f" [{inner}]"
            return match.group(0)

        converted = headword_slash_re.sub(convert_headword, text)
        # Second pass: trailing /ipa/ after [ipa]
        converted = after_bracket_re.sub(convert_trailing, converted)
        if converted != text:
            return converted

        # Nothing matched so far: try a standalone /ipa/ at the start.
        leading = leading_slash_re.match(text)
        if leading is None:
            return converted
        inner = leading.group(1).strip()
        if reject_re.search(inner):
            return converted
        inner = inner.lstrip("'").strip()
        if not inner:
            return converted
        return "[" + inner + "]" + text[leading.end():]

    def test_tiger_dict_lookup(self):
        """tiger /'taiga/ → tiger [tˈaɪgə] (from dictionary)."""
        converted = self._run_step_5h("tiger /'taiga/ Nomen Tiger")
        assert "[tˈaɪgə]" in converted
        assert "/'taiga/" not in converted
        assert converted.startswith("tiger")

    def test_tight_no_space(self):
        """tight²/tait/ → tight² [tˈaɪt] (no space before slash)."""
        converted = self._run_step_5h("tight²/tait/ Adv fest")
        assert "[tˈaɪt]" in converted
        assert "/tait/" not in converted

    def test_unknown_word_falls_back_to_ocr(self):
        """tinned/und/ → tinned [und] (not in dictionary, keeps OCR IPA)."""
        converted = self._run_step_5h("tinned/und/ Adj Dosen-")
        assert "[und]" in converted
        assert "/und/" not in converted

    def test_sb_sth_not_matched(self):
        """sb/sth should NOT be treated as IPA (contains space/parens)."""
        text = "(tie sb/sth up) jdn/etwas anbinden"
        converted = self._run_step_5h(text)
        # The slash-delimited span "sth up) jdn" has spaces and a paren,
        # so it is rejected and the text comes back unchanged.
        assert converted == text

    def test_double_ipa_both_converted(self):
        """times/taimz/ /tamz/ → times [tˈaɪmz] [tamz] (both converted)."""
        converted = self._run_step_5h("times/taimz/ /tamz/ Präp")
        assert "[tˈaɪmz]" in converted
        assert "[tamz]" in converted
        assert "/taimz/" not in converted
        assert "/tamz/" not in converted

    def test_standalone_slash_ipa_at_start(self):
        """/tam/ Nomen → [tam] Nomen (no headword in cell)."""
        converted = self._run_step_5h("/tam/ Nomen 1 Zeit")
        assert converted.startswith("[tam]")
        assert "/tam/" not in converted

    def test_no_slashes_unchanged(self):
        """Text without slashes passes through unchanged."""
        text = "hello world"
        converted = self._run_step_5h(text)
        assert converted == text

    def test_tile_dict_lookup(self):
        """tile /tail/ → tile [tˈaɪl]."""
        converted = self._run_step_5h("tile /tail/ Nomen Dachziegel")
        assert "[tˈaɪl]" in converted
# ---------------------------------------------------------------------------
# Color detection: red false-positive suppression
# ---------------------------------------------------------------------------
class TestRedFalsePositiveSuppression:
    """Red requires median_sat >= 80 to avoid scanner artifact false positives."""

    @staticmethod
    def _detect_color(img_hsv, word_box):
        """Convert an HSV test image to BGR and return the detected colour name.

        ``detect_word_colors`` is called with a one-element word-box list and
        the ``color_name`` it writes onto that word box is returned.
        """
        from cv_color_detect import detect_word_colors

        img_bgr = cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR)
        word_boxes = [word_box]
        detect_word_colors(img_bgr, word_boxes)
        return word_boxes[0]["color_name"]

    def test_low_saturation_red_classified_as_black(self):
        """Black text with slight warm scanner tint (sat ~60) → black, not red."""
        # 40 rows x 200 columns, filled uniformly with dark, slightly warm
        # pixels. HSV: hue=5 (red range), sat=60 (above 55 threshold but
        # below 80), val=40.
        img_hsv = np.full((40, 200, 3), [5, 60, 40], dtype=np.uint8)
        word_box = {"left": 10, "top": 5, "width": 50, "height": 20, "text": "test"}

        color_name = self._detect_color(img_hsv, word_box)

        assert color_name == "black", \
            f"Expected black, got {color_name} (scanner artifact false positive)"

    def test_high_saturation_red_classified_as_red(self):
        """Genuinely red text (sat=180) → red."""
        # White background (H=0, S=0, V=255) ...
        img_hsv = np.full((40, 200, 3), [0, 0, 255], dtype=np.uint8)
        # ... with a red text patch (H=5, S=180, V=200) inside the word box.
        img_hsv[8:18, 15:55] = [5, 180, 200]
        word_box = {"left": 10, "top": 5, "width": 50, "height": 20, "text": "red"}

        color_name = self._detect_color(img_hsv, word_box)

        assert color_name == "red", \
            f"Expected red, got {color_name}"