"""Tests for cv_gutter_repair: gutter-edge word detection and repair.""" import pytest import sys import os # Add parent directory to path so we can import the module sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) from cv_gutter_repair import ( _is_known, _try_hyphen_join, _try_spell_fix, _edit_distance, _word_is_at_gutter_edge, _MIN_WORD_LEN_SPELL, _MIN_WORD_LEN_HYPHEN, analyse_grid_for_gutter_repair, apply_gutter_suggestions, ) # --------------------------------------------------------------------------- # Helper function tests # --------------------------------------------------------------------------- class TestEditDistance: def test_identical(self): assert _edit_distance("hello", "hello") == 0 def test_one_substitution(self): assert _edit_distance("stammeli", "stammeln") == 1 def test_one_deletion(self): assert _edit_distance("cat", "ca") == 1 def test_one_insertion(self): assert _edit_distance("ca", "cat") == 1 def test_empty(self): assert _edit_distance("", "abc") == 3 assert _edit_distance("abc", "") == 3 def test_both_empty(self): assert _edit_distance("", "") == 0 class TestWordIsAtGutterEdge: def test_word_at_right_edge(self): # Word right edge at 90% of column = within gutter zone word_bbox = {"left": 80, "width": 15} # right edge = 95 assert _word_is_at_gutter_edge(word_bbox, col_x=0, col_width=100) def test_word_in_middle(self): # Word right edge at 50% of column = NOT at gutter word_bbox = {"left": 30, "width": 20} # right edge = 50 assert not _word_is_at_gutter_edge(word_bbox, col_x=0, col_width=100) def test_word_at_left(self): word_bbox = {"left": 5, "width": 20} # right edge = 25 assert not _word_is_at_gutter_edge(word_bbox, col_x=0, col_width=100) def test_zero_width_column(self): word_bbox = {"left": 0, "width": 10} assert not _word_is_at_gutter_edge(word_bbox, col_x=0, col_width=0) # --------------------------------------------------------------------------- # Spellchecker-dependent tests (skip if not installed) # --------------------------------------------------------------------------- try: from spellchecker import SpellChecker _HAS_SPELLCHECKER = True except ImportError: _HAS_SPELLCHECKER = False needs_spellchecker = pytest.mark.skipif( not _HAS_SPELLCHECKER, reason="pyspellchecker not installed" ) @needs_spellchecker class TestIsKnown: def test_known_english(self): assert _is_known("hello") is True assert _is_known("world") is True def test_known_german(self): assert _is_known("verkünden") is True assert _is_known("stammeln") is True def test_unknown_garbled(self): assert _is_known("stammeli") is False assert _is_known("xyzqwp") is False def test_short_word(self): # Words < 3 chars are not checked assert _is_known("a") is False @needs_spellchecker class TestTryHyphenJoin: def test_direct_join(self): # "ver" + "künden" = "verkünden" result = _try_hyphen_join("ver-", "künden") assert result is not None joined, missing, conf = result assert joined == "verkünden" assert missing == "" assert conf >= 0.9 def test_join_with_missing_chars(self): # "ve" + "künden" → needs "r" in between → "verkünden" result = _try_hyphen_join("ve", "künden", max_missing=2) assert result is not None joined, missing, conf = result assert joined == "verkünden" assert "r" in missing def test_no_valid_join(self): result = _try_hyphen_join("xyz", "qwpgh") assert result is None def test_empty_inputs(self): assert _try_hyphen_join("", "word") is None assert _try_hyphen_join("word", "") is None def test_join_strips_trailing_punctuation(self): # "ver" + "künden," → should still find "verkünden" despite comma result = _try_hyphen_join("ver-", "künden,") assert result is not None joined, missing, conf = result assert joined == "verkünden" def test_join_with_missing_chars_and_punctuation(self): # "ve" + "künden," → needs "r" in between, comma must be stripped result = _try_hyphen_join("ve", "künden,", max_missing=2) assert result is not None joined, missing, conf = result assert joined == "verkünden" assert "r" in missing @needs_spellchecker class TestTrySpellFix: def test_fix_garbled_ending_returns_alternatives(self): # "stammeli" should return a correction with alternatives result = _try_spell_fix("stammeli", col_type="column_de") assert result is not None corrected, conf, alts = result # The best correction is one of the valid forms all_options = [corrected] + alts all_lower = [w.lower() for w in all_options] # "stammeln" must be among the candidates assert "stammeln" in all_lower, f"Expected 'stammeln' in {all_options}" def test_known_word_not_fixed(self): # "Haus" is correct — no fix needed result = _try_spell_fix("Haus", col_type="column_de") # Should be None since the word is correct if result is not None: corrected, _, _ = result assert corrected.lower() == "haus" def test_short_word_skipped(self): result = _try_spell_fix("ab") assert result is None def test_min_word_len_thresholds(self): assert _MIN_WORD_LEN_HYPHEN == 2 assert _MIN_WORD_LEN_SPELL == 3 # --------------------------------------------------------------------------- # Grid analysis tests # --------------------------------------------------------------------------- def _make_grid(cells, columns=None): """Helper to create a minimal grid_data structure.""" if columns is None: columns = [ {"index": 0, "type": "column_en", "x_min_px": 0, "x_max_px": 200}, {"index": 1, "type": "column_de", "x_min_px": 200, "x_max_px": 400}, {"index": 2, "type": "column_text", "x_min_px": 400, "x_max_px": 600}, ] return { "image_width": 600, "image_height": 800, "zones": [{ "columns": columns, "cells": cells, }], } def _make_cell(row, col, text, left=0, width=50, col_width=200, col_x=0): """Helper to create a cell dict with word_boxes at a specific position.""" return { "cell_id": f"R{row:02d}_C{col}", "row_index": row, "col_index": col, "col_type": "column_text", "text": text, "confidence": 90.0, "bbox_px": {"x": left, "y": row * 25, "w": width, "h": 20}, "word_boxes": [ {"text": text, "left": left, "top": row * 25, "width": width, "height": 20, "conf": 90}, ], } @needs_spellchecker class TestAnalyseGrid: def test_empty_grid(self): result = analyse_grid_for_gutter_repair({"zones": []}) assert result["suggestions"] == [] assert result["stats"]["words_checked"] == 0 def test_detects_spell_fix_at_edge(self): # "stammeli" at position 160 in a column 0-200 wide = 80% = at gutter cells = [ _make_cell(29, 2, "stammeli", left=540, width=55, col_width=200, col_x=400), ] grid = _make_grid(cells) result = analyse_grid_for_gutter_repair(grid) suggestions = result["suggestions"] assert len(suggestions) >= 1 assert suggestions[0]["type"] == "spell_fix" assert suggestions[0]["suggested_text"] == "stammeln" def test_detects_hyphen_join(self): # Row 30: "ve" at gutter edge, Row 31: "künden" cells = [ _make_cell(30, 2, "ve", left=570, width=25, col_width=200, col_x=400), _make_cell(31, 2, "künden", left=410, width=80, col_width=200, col_x=400), ] grid = _make_grid(cells) result = analyse_grid_for_gutter_repair(grid) suggestions = result["suggestions"] # Should find hyphen_join or spell_fix assert len(suggestions) >= 1 def test_ignores_known_words(self): # "hello" is a known word — should not be suggested cells = [ _make_cell(0, 0, "hello", left=160, width=35), ] grid = _make_grid(cells) result = analyse_grid_for_gutter_repair(grid) # Should not suggest anything for known words spell_fixes = [s for s in result["suggestions"] if s["original_text"] == "hello"] assert len(spell_fixes) == 0 def test_ignores_words_not_at_edge(self): # "stammeli" at position 10 = NOT at gutter edge cells = [ _make_cell(0, 0, "stammeli", left=10, width=50), ] grid = _make_grid(cells) result = analyse_grid_for_gutter_repair(grid) assert len(result["suggestions"]) == 0 # --------------------------------------------------------------------------- # Apply suggestions tests # --------------------------------------------------------------------------- class TestApplySuggestions: def test_apply_spell_fix(self): cells = [ {"cell_id": "R29_C2", "row_index": 29, "col_index": 2, "text": "er stammeli", "word_boxes": []}, ] grid = _make_grid(cells) suggestions = [{ "id": "abc", "type": "spell_fix", "zone_index": 0, "row_index": 29, "col_index": 2, "original_text": "stammeli", "suggested_text": "stammeln", }] result = apply_gutter_suggestions(grid, ["abc"], suggestions) assert result["applied_count"] == 1 assert grid["zones"][0]["cells"][0]["text"] == "er stammeln" def test_apply_hyphen_join(self): cells = [ {"cell_id": "R30_C2", "row_index": 30, "col_index": 2, "text": "ve", "word_boxes": []}, {"cell_id": "R31_C2", "row_index": 31, "col_index": 2, "text": "künden und", "word_boxes": []}, ] grid = _make_grid(cells) suggestions = [{ "id": "def", "type": "hyphen_join", "zone_index": 0, "row_index": 30, "col_index": 2, "original_text": "ve", "suggested_text": "verkünden", "next_row_index": 31, "display_parts": ["ver-", "künden"], "missing_chars": "r", }] result = apply_gutter_suggestions(grid, ["def"], suggestions) assert result["applied_count"] == 1 # Current row: "ve" replaced with "ver-" assert grid["zones"][0]["cells"][0]["text"] == "ver-" # Next row: "künden" removed, "und" remains assert grid["zones"][0]["cells"][1]["text"] == "und" def test_apply_nothing_when_no_accepted(self): grid = _make_grid([]) result = apply_gutter_suggestions(grid, [], []) assert result["applied_count"] == 0 def test_skip_unknown_suggestion_id(self): cells = [ {"cell_id": "R0_C0", "row_index": 0, "col_index": 0, "text": "test", "word_boxes": []}, ] grid = _make_grid(cells) suggestions = [{ "id": "abc", "type": "spell_fix", "zone_index": 0, "row_index": 0, "col_index": 0, "original_text": "test", "suggested_text": "test2", }] # Accept a non-existent ID result = apply_gutter_suggestions(grid, ["nonexistent"], suggestions) assert result["applied_count"] == 0 assert grid["zones"][0]["cells"][0]["text"] == "test"