Fix: merge cell-wrap continuation rows in vocabulary extraction
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 58s
CI / test-go-edu-search (push) Successful in 48s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has started running
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 58s
CI / test-go-edu-search (push) Successful in 48s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has started running
When textbook authors wrap text within a cell (e.g. long German translations), OCR treats each physical line as a separate row. New _merge_wrapped_rows() detects this by checking if the primary column (EN) is empty — indicating a continuation, not a new entry.

Handles: empty EN + DE text, empty EN + example text, parenthetical continuations like "(bei)", triple wraps, comma-separated lists.

12 tests added covering all cases.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
135
klausur-service/backend/tests/test_merge_wrapped_rows.py
Normal file
135
klausur-service/backend/tests/test_merge_wrapped_rows.py
Normal file
@@ -0,0 +1,135 @@
|
||||
"""Tests for _merge_wrapped_rows — cell-wrap continuation row merging."""
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
||||
from cv_cell_grid import _merge_wrapped_rows
|
||||
|
||||
|
||||
def _entry(row_index, english='', german='', example=''):
|
||||
return {
|
||||
'row_index': row_index,
|
||||
'english': english,
|
||||
'german': german,
|
||||
'example': example,
|
||||
}
|
||||
|
||||
|
||||
class TestMergeWrappedRows:
    """Checks for _merge_wrapped_rows on cell-wrap continuation rows."""

    def test_basic_en_empty_merge(self):
        """Empty EN with DE and example text is folded into the row above."""
        head = _entry(
            0,
            english='take part (in)',
            german='teilnehmen (an), mitmachen',
            example='More than 200 singers took',
        )
        tail = _entry(1, english='', german='(bei)', example='part in the concert.')

        merged = _merge_wrapped_rows([head, tail])

        assert len(merged) == 1
        row = merged[0]
        assert row['german'] == 'teilnehmen (an), mitmachen (bei)'
        assert row['example'] == 'More than 200 singers took part in the concert.'

    def test_en_empty_de_only(self):
        """Empty EN with only a DE continuation (no example text)."""
        head = _entry(0, english='competition', german='der Wettbewerb,')
        tail = _entry(1, english='', german='das Turnier')

        merged = _merge_wrapped_rows([head, tail])

        assert len(merged) == 1
        assert merged[0]['german'] == 'der Wettbewerb, das Turnier'

    def test_en_empty_example_only(self):
        """Empty EN and DE, only the example column continues."""
        head = _entry(0, english='to arrive', german='ankommen', example='We arrived at the')
        tail = _entry(1, english='', german='', example='hotel at midnight.')

        merged = _merge_wrapped_rows([head, tail])

        assert len(merged) == 1
        assert merged[0]['example'] == 'We arrived at the hotel at midnight.'

    def test_de_empty_paren_continuation(self):
        """Empty DE with a parenthetical EN is appended to the previous EN."""
        head = _entry(0, english='to take part', german='teilnehmen')
        tail = _entry(1, english='(in)', german='')

        merged = _merge_wrapped_rows([head, tail])

        assert len(merged) == 1
        assert merged[0]['english'] == 'to take part (in)'

    def test_de_empty_lowercase_continuation(self):
        """Empty DE with a lowercase-initial EN is appended to the previous EN."""
        head = _entry(0, english='to put up', german='aufstellen')
        tail = _entry(1, english='with sth.', german='')

        merged = _merge_wrapped_rows([head, tail])

        assert len(merged) == 1
        assert merged[0]['english'] == 'to put up with sth.'

    def test_no_merge_both_have_content(self):
        """Rows with both EN and DE filled are regular entries and stay apart."""
        rows = [
            _entry(0, english='house', german='Haus'),
            _entry(1, english='garden', german='Garten'),
        ]

        merged = _merge_wrapped_rows(rows)

        assert len(merged) == 2

    def test_no_merge_new_word_uppercase(self):
        """Uppercase-initial EN with empty DE may be a new word: not merged."""
        rows = [
            _entry(0, english='house', german='Haus'),
            _entry(1, english='Garden', german=''),
        ]

        merged = _merge_wrapped_rows(rows)

        assert len(merged) == 2

    def test_triple_wrap(self):
        """Two consecutive continuation rows both collapse into the first row."""
        rows = [
            _entry(0, english='competition', german='der Wettbewerb,'),
            _entry(1, english='', german='das Turnier,'),
            _entry(2, english='', german='der Wettkampf'),
        ]

        merged = _merge_wrapped_rows(rows)

        assert len(merged) == 1
        assert merged[0]['german'] == 'der Wettbewerb, das Turnier, der Wettkampf'

    def test_empty_entries(self):
        """An empty input list yields an empty list."""
        merged = _merge_wrapped_rows([])

        assert merged == []

    def test_single_entry(self):
        """A lone entry has nothing to merge with; one row comes back."""
        rows = [_entry(0, english='house', german='Haus')]

        merged = _merge_wrapped_rows(rows)

        assert len(merged) == 1

    def test_mixed_normal_and_wrapped(self):
        """A wrapped row between regular rows merges only into its own head."""
        rows = [
            _entry(0, english='house', german='Haus'),
            _entry(1, english='take part (in)', german='teilnehmen (an),'),
            _entry(2, english='', german='mitmachen (bei)'),
            _entry(3, english='garden', german='Garten'),
        ]

        merged = _merge_wrapped_rows(rows)

        assert len(merged) == 3
        first, second, third = merged[0], merged[1], merged[2]
        assert first['english'] == 'house'
        assert second['german'] == 'teilnehmen (an), mitmachen (bei)'
        assert third['english'] == 'garden'

    def test_comma_separator_handling(self):
        """Previous DE ends with a comma: parts are joined by a single space."""
        head = _entry(0, english='word', german='Wort,')
        tail = _entry(1, english='', german='Ausdruck')

        merged = _merge_wrapped_rows([head, tail])

        assert len(merged) == 1
        assert merged[0]['german'] == 'Wort, Ausdruck'
|
||||
Reference in New Issue
Block a user