Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 50s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m35s
CI / test-python-agent-core (push) Successful in 31s
CI / test-nodejs-website (push) Successful in 34s
The next-row word "künden," had a trailing comma, so the dictionary lookup failed for the joined candidate "verkünden,". The join now strips trailing punctuation (.,;:!?) from the next-row word before joining. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
340 lines
12 KiB
Python
340 lines
12 KiB
Python
"""Tests for cv_gutter_repair: gutter-edge word detection and repair."""
|
|
|
|
import pytest
|
|
import sys
|
|
import os
|
|
|
|
# Add parent directory to path so we can import the module
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
|
|
from cv_gutter_repair import (
|
|
_is_known,
|
|
_try_hyphen_join,
|
|
_try_spell_fix,
|
|
_edit_distance,
|
|
_word_is_at_gutter_edge,
|
|
_MIN_WORD_LEN_SPELL,
|
|
_MIN_WORD_LEN_HYPHEN,
|
|
analyse_grid_for_gutter_repair,
|
|
apply_gutter_suggestions,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helper function tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestEditDistance:
    """Hand-computed cases for the ``_edit_distance`` helper."""

    def test_identical(self):
        """Two equal strings are zero edits apart."""
        distance = _edit_distance("hello", "hello")
        assert distance == 0

    def test_one_substitution(self):
        """Replacing the final letter counts as a single edit."""
        distance = _edit_distance("stammeli", "stammeln")
        assert distance == 1

    def test_one_deletion(self):
        """Dropping the last letter counts as a single edit."""
        distance = _edit_distance("cat", "ca")
        assert distance == 1

    def test_one_insertion(self):
        """Appending one letter counts as a single edit."""
        distance = _edit_distance("ca", "cat")
        assert distance == 1

    def test_empty(self):
        """A three-letter word is three edits from "" in either direction."""
        for source, target in (("", "abc"), ("abc", "")):
            assert _edit_distance(source, target) == 3

    def test_both_empty(self):
        """Two empty strings are zero edits apart."""
        distance = _edit_distance("", "")
        assert distance == 0
|
|
|
|
|
|
class TestWordIsAtGutterEdge:
    """Checks for ``_word_is_at_gutter_edge`` with simple left/width boxes."""

    def test_word_at_right_edge(self):
        # left 80 + width 15 -> right edge at 95 of a 100-wide column:
        # expected to count as being at the gutter edge.
        bbox = dict(left=80, width=15)
        at_edge = _word_is_at_gutter_edge(bbox, col_x=0, col_width=100)
        assert at_edge

    def test_word_in_middle(self):
        # left 30 + width 20 -> right edge at 50 of 100: not at the gutter.
        bbox = dict(left=30, width=20)
        at_edge = _word_is_at_gutter_edge(bbox, col_x=0, col_width=100)
        assert not at_edge

    def test_word_at_left(self):
        # left 5 + width 20 -> right edge at 25 of 100: not at the gutter.
        bbox = dict(left=5, width=20)
        at_edge = _word_is_at_gutter_edge(bbox, col_x=0, col_width=100)
        assert not at_edge

    def test_zero_width_column(self):
        # A column of width 0 must yield a falsy answer.
        bbox = dict(left=0, width=10)
        at_edge = _word_is_at_gutter_edge(bbox, col_x=0, col_width=0)
        assert not at_edge
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Spellchecker-dependent tests (skip if not installed)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Probe for the optional ``spellchecker`` module: the import is attempted only
# to learn whether it succeeds, and the outcome is recorded as a flag.
try:
    from spellchecker import SpellChecker
except ImportError:
    _HAS_SPELLCHECKER = False
else:
    _HAS_SPELLCHECKER = True

# Skip marker for tests (or whole test classes) that need the spell checker.
needs_spellchecker = pytest.mark.skipif(
    not _HAS_SPELLCHECKER,
    reason="pyspellchecker not installed",
)
|
|
|
|
|
|
@needs_spellchecker
class TestIsKnown:
    """Checks that ``_is_known`` returns strict ``True``/``False`` values."""

    def test_known_english(self):
        """Common English words are reported as known."""
        for word in ("hello", "world"):
            assert _is_known(word) is True

    def test_known_german(self):
        """German words (including one with an umlaut) are reported as known."""
        for word in ("verkünden", "stammeln"):
            assert _is_known(word) is True

    def test_unknown_garbled(self):
        """Garbled or nonsense strings are reported as unknown."""
        for word in ("stammeli", "xyzqwp"):
            assert _is_known(word) is False

    def test_short_word(self):
        # A single-letter input is reported as unknown; per the original
        # note, words shorter than 3 characters are not checked at all.
        assert _is_known("a") is False
|
|
|
|
|
|
@needs_spellchecker
class TestTryHyphenJoin:
    """Checks for ``_try_hyphen_join``.

    The tests treat a successful result as a 3-item sequence
    ``(joined_word, missing_chars, confidence)`` and a failure as ``None``.
    """

    def test_direct_join(self):
        # "ver-" followed by "künden" joins to "verkünden" with nothing missing.
        outcome = _try_hyphen_join("ver-", "künden")
        assert outcome is not None
        word, gap, confidence = outcome
        assert word == "verkünden"
        assert gap == ""
        assert confidence >= 0.9

    def test_join_with_missing_chars(self):
        # "ve" + "künden" only forms "verkünden" once an "r" is inserted.
        outcome = _try_hyphen_join("ve", "künden", max_missing=2)
        assert outcome is not None
        word, gap, _confidence = outcome
        assert word == "verkünden"
        assert "r" in gap

    def test_no_valid_join(self):
        # Two nonsense fragments must not produce a join.
        outcome = _try_hyphen_join("xyz", "qwpgh")
        assert outcome is None

    def test_empty_inputs(self):
        # An empty fragment on either side yields no result.
        for first, second in (("", "word"), ("word", "")):
            assert _try_hyphen_join(first, second) is None

    def test_join_strips_trailing_punctuation(self):
        # The comma after "künden," must not prevent finding "verkünden".
        outcome = _try_hyphen_join("ver-", "künden,")
        assert outcome is not None
        word, _gap, _confidence = outcome
        assert word == "verkünden"

    def test_join_with_missing_chars_and_punctuation(self):
        # Same as the missing-"r" case, but the next-row word carries a comma.
        outcome = _try_hyphen_join("ve", "künden,", max_missing=2)
        assert outcome is not None
        word, gap, _confidence = outcome
        assert word == "verkünden"
        assert "r" in gap
|
|
|
|
|
|
@needs_spellchecker
class TestTrySpellFix:
    """Checks for ``_try_spell_fix`` and the minimum-word-length constants.

    The tests treat a successful result as a 3-item sequence
    ``(corrected, confidence, alternatives)`` and "no fix" as ``None``.
    """

    def test_fix_garbled_ending_returns_alternatives(self):
        # "stammeli" must be correctable; "stammeln" has to appear either as
        # the best correction or among the alternatives (case-insensitive).
        outcome = _try_spell_fix("stammeli", col_type="column_de")
        assert outcome is not None
        best, _confidence, alternatives = outcome
        candidates = [best] + alternatives
        lowered = [candidate.lower() for candidate in candidates]
        assert "stammeln" in lowered, f"Expected 'stammeln' in {candidates}"

    def test_known_word_not_fixed(self):
        # "Haus" is already correct. Either no fix is returned at all, or the
        # returned "correction" is the same word (ignoring case).
        outcome = _try_spell_fix("Haus", col_type="column_de")
        if outcome is None:
            return
        best, _confidence, _alternatives = outcome
        assert best.lower() == "haus"

    def test_short_word_skipped(self):
        # A two-letter word yields no spell fix.
        outcome = _try_spell_fix("ab")
        assert outcome is None

    def test_min_word_len_thresholds(self):
        # Pin the module-level thresholds so a change to them is noticed.
        assert _MIN_WORD_LEN_HYPHEN == 2
        assert _MIN_WORD_LEN_SPELL == 3
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Grid analysis tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _make_grid(cells, columns=None):
|
|
"""Helper to create a minimal grid_data structure."""
|
|
if columns is None:
|
|
columns = [
|
|
{"index": 0, "type": "column_en", "x_min_px": 0, "x_max_px": 200},
|
|
{"index": 1, "type": "column_de", "x_min_px": 200, "x_max_px": 400},
|
|
{"index": 2, "type": "column_text", "x_min_px": 400, "x_max_px": 600},
|
|
]
|
|
return {
|
|
"image_width": 600,
|
|
"image_height": 800,
|
|
"zones": [{
|
|
"columns": columns,
|
|
"cells": cells,
|
|
}],
|
|
}
|
|
|
|
|
|
def _make_cell(row, col, text, left=0, width=50, col_width=200, col_x=0):
|
|
"""Helper to create a cell dict with word_boxes at a specific position."""
|
|
return {
|
|
"cell_id": f"R{row:02d}_C{col}",
|
|
"row_index": row,
|
|
"col_index": col,
|
|
"col_type": "column_text",
|
|
"text": text,
|
|
"confidence": 90.0,
|
|
"bbox_px": {"x": left, "y": row * 25, "w": width, "h": 20},
|
|
"word_boxes": [
|
|
{"text": text, "left": left, "top": row * 25, "width": width, "height": 20, "conf": 90},
|
|
],
|
|
}
|
|
|
|
|
|
@needs_spellchecker
class TestAnalyseGrid:
    """End-to-end checks for ``analyse_grid_for_gutter_repair`` on tiny grids."""

    def test_empty_grid(self):
        # A grid without zones produces no suggestions and checks no words.
        report = analyse_grid_for_gutter_repair({"zones": []})
        assert report["suggestions"] == []
        assert report["stats"]["words_checked"] == 0

    def test_detects_spell_fix_at_edge(self):
        # The word box spans x = 540..595 inside the default third column
        # (400..600), so its right edge sits close to the column's right side.
        garbled = _make_cell(
            29, 2, "stammeli", left=540, width=55, col_width=200, col_x=400
        )
        report = analyse_grid_for_gutter_repair(_make_grid([garbled]))
        found = report["suggestions"]
        assert len(found) >= 1
        top = found[0]
        assert top["type"] == "spell_fix"
        assert top["suggested_text"] == "stammeln"

    def test_detects_hyphen_join(self):
        # Row 30 ends with the fragment "ve" near the right side of the column
        # (box 570..595); row 31 starts with "künden" (box 410..490).
        fragment = _make_cell(30, 2, "ve", left=570, width=25, col_width=200, col_x=400)
        continuation = _make_cell(
            31, 2, "künden", left=410, width=80, col_width=200, col_x=400
        )
        report = analyse_grid_for_gutter_repair(_make_grid([fragment, continuation]))
        found = report["suggestions"]
        # Any suggestion is accepted here, whether hyphen_join or spell_fix.
        assert len(found) >= 1

    def test_ignores_known_words(self):
        # "hello" is a valid word, so no suggestion may target it even though
        # its box (160..195) lies near the right side of the 0..200 column.
        known = _make_cell(0, 0, "hello", left=160, width=35)
        report = analyse_grid_for_gutter_repair(_make_grid([known]))
        targeting_hello = [
            entry for entry in report["suggestions"]
            if entry["original_text"] == "hello"
        ]
        assert len(targeting_hello) == 0

    def test_ignores_words_not_at_edge(self):
        # "stammeli" is garbled, but its box (10..60) is far from the right
        # side of the 0..200 column, so nothing should be suggested.
        garbled = _make_cell(0, 0, "stammeli", left=10, width=50)
        report = analyse_grid_for_gutter_repair(_make_grid([garbled]))
        assert len(report["suggestions"]) == 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Apply suggestions tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestApplySuggestions:
    """Checks that ``apply_gutter_suggestions`` edits the grid's cell text in place."""

    def test_apply_spell_fix(self):
        # An accepted spell_fix replaces the garbled word inside the cell text.
        target_cell = {
            "cell_id": "R29_C2", "row_index": 29, "col_index": 2,
            "text": "er stammeli", "word_boxes": [],
        }
        grid = _make_grid([target_cell])
        fix = {
            "id": "abc",
            "type": "spell_fix",
            "zone_index": 0,
            "row_index": 29,
            "col_index": 2,
            "original_text": "stammeli",
            "suggested_text": "stammeln",
        }

        outcome = apply_gutter_suggestions(grid, ["abc"], [fix])

        assert outcome["applied_count"] == 1
        zone_cells = grid["zones"][0]["cells"]
        assert zone_cells[0]["text"] == "er stammeln"

    def test_apply_hyphen_join(self):
        # An accepted hyphen_join rewrites two rows: the fragment row and the
        # row that holds the continuation.
        fragment_cell = {
            "cell_id": "R30_C2", "row_index": 30, "col_index": 2,
            "text": "ve", "word_boxes": [],
        }
        continuation_cell = {
            "cell_id": "R31_C2", "row_index": 31, "col_index": 2,
            "text": "künden und", "word_boxes": [],
        }
        grid = _make_grid([fragment_cell, continuation_cell])
        join = {
            "id": "def",
            "type": "hyphen_join",
            "zone_index": 0,
            "row_index": 30,
            "col_index": 2,
            "original_text": "ve",
            "suggested_text": "verkünden",
            "next_row_index": 31,
            "display_parts": ["ver-", "künden"],
            "missing_chars": "r",
        }

        outcome = apply_gutter_suggestions(grid, ["def"], [join])

        assert outcome["applied_count"] == 1
        zone_cells = grid["zones"][0]["cells"]
        # Fragment row: "ve" becomes "ver-".
        assert zone_cells[0]["text"] == "ver-"
        # Continuation row: "künden" is removed and only "und" is left.
        assert zone_cells[1]["text"] == "und"

    def test_apply_nothing_when_no_accepted(self):
        # No accepted ids and no suggestions: nothing is applied.
        grid = _make_grid([])
        outcome = apply_gutter_suggestions(grid, [], [])
        assert outcome["applied_count"] == 0

    def test_skip_unknown_suggestion_id(self):
        # Accepting an id that matches no suggestion must leave the grid alone.
        untouched_cell = {
            "cell_id": "R0_C0", "row_index": 0, "col_index": 0,
            "text": "test", "word_boxes": [],
        }
        grid = _make_grid([untouched_cell])
        fix = {
            "id": "abc",
            "type": "spell_fix",
            "zone_index": 0,
            "row_index": 0,
            "col_index": 0,
            "original_text": "test",
            "suggested_text": "test2",
        }

        outcome = apply_gutter_suggestions(grid, ["nonexistent"], [fix])

        assert outcome["applied_count"] == 0
        zone_cells = grid["zones"][0]["cells"]
        assert zone_cells[0]["text"] == "test"
|