Files
breakpilot-lehrer/klausur-service/backend/tests/test_gutter_repair.py
Benjamin Admin aabd849e35
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 50s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m35s
CI / test-python-agent-core (push) Successful in 31s
CI / test-nodejs-website (push) Successful in 34s
Fix hyphen-join: strip trailing punctuation from continuation word
The next-row word "künden," had a trailing comma, causing dictionary
lookup to fail for "verkünden,". Now strips .,;:!? before joining.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 19:25:28 +02:00

340 lines
12 KiB
Python

"""Tests for cv_gutter_repair: gutter-edge word detection and repair."""
import pytest
import sys
import os
# Add parent directory to path so we can import the module
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from cv_gutter_repair import (
_is_known,
_try_hyphen_join,
_try_spell_fix,
_edit_distance,
_word_is_at_gutter_edge,
_MIN_WORD_LEN_SPELL,
_MIN_WORD_LEN_HYPHEN,
analyse_grid_for_gutter_repair,
apply_gutter_suggestions,
)
# ---------------------------------------------------------------------------
# Helper function tests
# ---------------------------------------------------------------------------
class TestEditDistance:
    """Checks `_edit_distance` on small, hand-verifiable string pairs."""

    def test_identical(self):
        """Two equal strings are expected to be zero edits apart."""
        distance = _edit_distance("hello", "hello")
        assert distance == 0

    def test_one_substitution(self):
        """Swapping the final character ("i" -> "n") is expected to cost 1."""
        distance = _edit_distance("stammeli", "stammeln")
        assert distance == 1

    def test_one_deletion(self):
        """Dropping the trailing "t" is expected to cost 1."""
        distance = _edit_distance("cat", "ca")
        assert distance == 1

    def test_one_insertion(self):
        """Appending a trailing "t" is expected to cost 1."""
        distance = _edit_distance("ca", "cat")
        assert distance == 1

    def test_empty(self):
        """An empty string versus "abc" is expected to cost 3 in either order."""
        for first, second in (("", "abc"), ("abc", "")):
            assert _edit_distance(first, second) == 3

    def test_both_empty(self):
        """Two empty strings are expected to be zero edits apart."""
        distance = _edit_distance("", "")
        assert distance == 0
class TestWordIsAtGutterEdge:
    """Checks `_word_is_at_gutter_edge` against word boxes in a single column."""

    def test_word_at_right_edge(self):
        """A word ending at x=95 in a 0..100 column counts as at the gutter."""
        # Right edge = left + width = 80 + 15 = 95 (95% of the column width).
        bbox = {"left": 80, "width": 15}
        at_edge = _word_is_at_gutter_edge(bbox, col_x=0, col_width=100)
        assert at_edge

    def test_word_in_middle(self):
        """A word ending at x=50 in a 0..100 column is not at the gutter."""
        # Right edge = 30 + 20 = 50 (half of the column width).
        bbox = {"left": 30, "width": 20}
        at_edge = _word_is_at_gutter_edge(bbox, col_x=0, col_width=100)
        assert not at_edge

    def test_word_at_left(self):
        """A word ending at x=25 in a 0..100 column is not at the gutter."""
        # Right edge = 5 + 20 = 25.
        bbox = {"left": 5, "width": 20}
        at_edge = _word_is_at_gutter_edge(bbox, col_x=0, col_width=100)
        assert not at_edge

    def test_zero_width_column(self):
        """A column of width 0 is expected to yield a falsy result."""
        bbox = {"left": 0, "width": 10}
        at_edge = _word_is_at_gutter_edge(bbox, col_x=0, col_width=0)
        assert not at_edge
# ---------------------------------------------------------------------------
# Spellchecker-dependent tests (skip if not installed)
# ---------------------------------------------------------------------------
# Probe for the optional pyspellchecker package; the import result is only
# used to decide whether dictionary-dependent tests below can run.
try:
    from spellchecker import SpellChecker
except ImportError:
    _HAS_SPELLCHECKER = False
else:
    _HAS_SPELLCHECKER = True

# Marker for tests and test classes that must be skipped when the
# spellchecker dictionaries are unavailable.
needs_spellchecker = pytest.mark.skipif(
    not _HAS_SPELLCHECKER,
    reason="pyspellchecker not installed",
)
@needs_spellchecker
class TestIsKnown:
    """Checks `_is_known` dictionary lookups for English and German words."""

    def test_known_english(self):
        """Common English words are expected to be reported as known."""
        for word in ("hello", "world"):
            assert _is_known(word) is True

    def test_known_german(self):
        """Common German words are expected to be reported as known."""
        for word in ("verkünden", "stammeln"):
            assert _is_known(word) is True

    def test_unknown_garbled(self):
        """Garbled strings are expected to be reported as unknown."""
        for word in ("stammeli", "xyzqwp"):
            assert _is_known(word) is False

    def test_short_word(self):
        """A one-letter word is expected to be reported as unknown."""
        # Words shorter than 3 characters are not checked.
        known = _is_known("a")
        assert known is False
@needs_spellchecker
class TestTryHyphenJoin:
    """Checks `_try_hyphen_join` for words split across two rows."""

    def test_direct_join(self):
        """ "ver-" + "künden" joins to "verkünden" with nothing missing."""
        outcome = _try_hyphen_join("ver-", "künden")
        assert outcome is not None
        joined_word, missing_chars, confidence = outcome
        assert joined_word == "verkünden"
        assert missing_chars == ""
        assert confidence >= 0.9

    def test_join_with_missing_chars(self):
        """ "ve" + "künden" needs an "r" in between to form "verkünden"."""
        outcome = _try_hyphen_join("ve", "künden", max_missing=2)
        assert outcome is not None
        joined_word, missing_chars, _ = outcome
        assert joined_word == "verkünden"
        assert "r" in missing_chars

    def test_no_valid_join(self):
        """Two garbage fragments are expected to yield no join at all."""
        outcome = _try_hyphen_join("xyz", "qwpgh")
        assert outcome is None

    def test_empty_inputs(self):
        """An empty fragment on either side is expected to yield None."""
        for first_part, second_part in (("", "word"), ("word", "")):
            assert _try_hyphen_join(first_part, second_part) is None

    def test_join_strips_trailing_punctuation(self):
        """A trailing comma on the continuation must not block the join."""
        # "ver-" + "künden," should still resolve to "verkünden".
        outcome = _try_hyphen_join("ver-", "künden,")
        assert outcome is not None
        joined_word, _, _ = outcome
        assert joined_word == "verkünden"

    def test_join_with_missing_chars_and_punctuation(self):
        """Missing-character recovery must also ignore the trailing comma."""
        # "ve" + "künden," → needs "r" in between; the comma is stripped.
        outcome = _try_hyphen_join("ve", "künden,", max_missing=2)
        assert outcome is not None
        joined_word, missing_chars, _ = outcome
        assert joined_word == "verkünden"
        assert "r" in missing_chars
@needs_spellchecker
class TestTrySpellFix:
    """Checks `_try_spell_fix` and the minimum word-length constants."""

    def test_fix_garbled_ending_returns_alternatives(self):
        """ "stammeli" must offer "stammeln" as the best fix or an alternative."""
        outcome = _try_spell_fix("stammeli", col_type="column_de")
        assert outcome is not None
        best, _, alternatives = outcome
        # The best correction plus its alternatives form the candidate pool.
        all_options = [best] + alternatives
        lowered = {option.lower() for option in all_options}
        assert "stammeln" in lowered, f"Expected 'stammeln' in {all_options}"

    def test_known_word_not_fixed(self):
        """A correct word yields no fix, or at most a case variant of itself."""
        outcome = _try_spell_fix("Haus", col_type="column_de")
        if outcome is None:
            # Nothing to correct: the word was accepted as-is.
            return
        fixed, _, _ = outcome
        assert fixed.lower() == "haus"

    def test_short_word_skipped(self):
        """A two-letter word is expected to be skipped (None)."""
        assert _try_spell_fix("ab") is None

    def test_min_word_len_thresholds(self):
        """Pins the hyphen-join and spell-fix minimum lengths to 2 and 3."""
        thresholds = (_MIN_WORD_LEN_HYPHEN, _MIN_WORD_LEN_SPELL)
        assert thresholds == (2, 3)
# ---------------------------------------------------------------------------
# Grid analysis tests
# ---------------------------------------------------------------------------
def _make_grid(cells, columns=None):
"""Helper to create a minimal grid_data structure."""
if columns is None:
columns = [
{"index": 0, "type": "column_en", "x_min_px": 0, "x_max_px": 200},
{"index": 1, "type": "column_de", "x_min_px": 200, "x_max_px": 400},
{"index": 2, "type": "column_text", "x_min_px": 400, "x_max_px": 600},
]
return {
"image_width": 600,
"image_height": 800,
"zones": [{
"columns": columns,
"cells": cells,
}],
}
def _make_cell(row, col, text, left=0, width=50, col_width=200, col_x=0):
"""Helper to create a cell dict with word_boxes at a specific position."""
return {
"cell_id": f"R{row:02d}_C{col}",
"row_index": row,
"col_index": col,
"col_type": "column_text",
"text": text,
"confidence": 90.0,
"bbox_px": {"x": left, "y": row * 25, "w": width, "h": 20},
"word_boxes": [
{"text": text, "left": left, "top": row * 25, "width": width, "height": 20, "conf": 90},
],
}
@needs_spellchecker
class TestAnalyseGrid:
    """Checks `analyse_grid_for_gutter_repair` on small synthetic grids."""

    def test_empty_grid(self):
        """A grid without zones yields no suggestions and zero checked words."""
        report = analyse_grid_for_gutter_repair({"zones": []})
        assert report["suggestions"] == []
        assert report["stats"]["words_checked"] == 0

    def test_detects_spell_fix_at_edge(self):
        """A garbled word at the column's right edge gets a spell_fix."""
        # "stammeli" spans x = 540..595 inside the third default column
        # (x = 400..600), so its right edge sits next to the gutter.
        garbled = _make_cell(
            29, 2, "stammeli", left=540, width=55, col_width=200, col_x=400
        )
        grid = _make_grid([garbled])
        suggestions = analyse_grid_for_gutter_repair(grid)["suggestions"]
        assert len(suggestions) >= 1
        first = suggestions[0]
        assert first["type"] == "spell_fix"
        assert first["suggested_text"] == "stammeln"

    def test_detects_hyphen_join(self):
        """A fragment at the edge followed by its continuation is flagged."""
        # Row 30 holds "ve" at the gutter edge; row 31 starts with "künden".
        fragment = _make_cell(30, 2, "ve", left=570, width=25, col_width=200, col_x=400)
        continuation = _make_cell(
            31, 2, "künden", left=410, width=80, col_width=200, col_x=400
        )
        grid = _make_grid([fragment, continuation])
        suggestions = analyse_grid_for_gutter_repair(grid)["suggestions"]
        # Either a hyphen_join or a spell_fix is acceptable here.
        assert len(suggestions) >= 1

    def test_ignores_known_words(self):
        """A dictionary word at the edge must not produce a suggestion."""
        grid = _make_grid([_make_cell(0, 0, "hello", left=160, width=35)])
        report = analyse_grid_for_gutter_repair(grid)
        about_hello = [
            entry for entry in report["suggestions"]
            if entry["original_text"] == "hello"
        ]
        assert about_hello == []

    def test_ignores_words_not_at_edge(self):
        """A garbled word far from the gutter must not produce a suggestion."""
        # "stammeli" spans x = 10..60 in the first column (x = 0..200).
        grid = _make_grid([_make_cell(0, 0, "stammeli", left=10, width=50)])
        report = analyse_grid_for_gutter_repair(grid)
        assert len(report["suggestions"]) == 0
# ---------------------------------------------------------------------------
# Apply suggestions tests
# ---------------------------------------------------------------------------
class TestApplySuggestions:
    """Checks `apply_gutter_suggestions` for accepted and ignored suggestions."""

    def test_apply_spell_fix(self):
        """An accepted spell_fix replaces the garbled word inside the cell text."""
        target_cell = {
            "cell_id": "R29_C2",
            "row_index": 29,
            "col_index": 2,
            "text": "er stammeli",
            "word_boxes": [],
        }
        grid = _make_grid([target_cell])
        spell_fix = {
            "id": "abc",
            "type": "spell_fix",
            "zone_index": 0,
            "row_index": 29,
            "col_index": 2,
            "original_text": "stammeli",
            "suggested_text": "stammeln",
        }
        outcome = apply_gutter_suggestions(grid, ["abc"], [spell_fix])
        assert outcome["applied_count"] == 1
        assert grid["zones"][0]["cells"][0]["text"] == "er stammeln"

    def test_apply_hyphen_join(self):
        """An accepted hyphen_join rewrites the current row and trims the next."""
        current_row = {
            "cell_id": "R30_C2",
            "row_index": 30,
            "col_index": 2,
            "text": "ve",
            "word_boxes": [],
        }
        next_row = {
            "cell_id": "R31_C2",
            "row_index": 31,
            "col_index": 2,
            "text": "künden und",
            "word_boxes": [],
        }
        grid = _make_grid([current_row, next_row])
        hyphen_join = {
            "id": "def",
            "type": "hyphen_join",
            "zone_index": 0,
            "row_index": 30,
            "col_index": 2,
            "original_text": "ve",
            "suggested_text": "verkünden",
            "next_row_index": 31,
            "display_parts": ["ver-", "künden"],
            "missing_chars": "r",
        }
        outcome = apply_gutter_suggestions(grid, ["def"], [hyphen_join])
        assert outcome["applied_count"] == 1
        updated_cells = grid["zones"][0]["cells"]
        # Current row: "ve" is replaced with the first display part "ver-".
        assert updated_cells[0]["text"] == "ver-"
        # Next row: "künden" is removed, leaving only "und".
        assert updated_cells[1]["text"] == "und"

    def test_apply_nothing_when_no_accepted(self):
        """With no accepted ids and no suggestions nothing is applied."""
        outcome = apply_gutter_suggestions(_make_grid([]), [], [])
        assert outcome["applied_count"] == 0

    def test_skip_unknown_suggestion_id(self):
        """An accepted id that matches no suggestion leaves the grid untouched."""
        only_cell = {
            "cell_id": "R0_C0",
            "row_index": 0,
            "col_index": 0,
            "text": "test",
            "word_boxes": [],
        }
        grid = _make_grid([only_cell])
        spell_fix = {
            "id": "abc",
            "type": "spell_fix",
            "zone_index": 0,
            "row_index": 0,
            "col_index": 0,
            "original_text": "test",
            "suggested_text": "test2",
        }
        # "nonexistent" does not match the only suggestion id ("abc").
        outcome = apply_gutter_suggestions(grid, ["nonexistent"], [spell_fix])
        assert outcome["applied_count"] == 0
        assert grid["zones"][0]["cells"][0]["text"] == "test"