From b8f1b716524c512faf33f22113a396a04fe25911 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Thu, 16 Apr 2026 08:32:45 +0200
Subject: [PATCH] Fix: merge cell-wrap continuation rows in vocabulary extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When textbook authors wrap text within a cell (e.g. long German
translations), OCR treats each physical line as a separate row.
New _merge_wrapped_rows() detects this by checking if the primary
column (EN) is empty — indicating a continuation, not a new entry.

Handles: empty EN + DE text, empty EN + example text, parenthetical
continuations like "(bei)", triple wraps, comma-separated lists.

12 tests added covering all cases.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 klausur-service/backend/cv_cell_grid.py       |  87 +++++++++++
 .../backend/tests/test_merge_wrapped_rows.py  | 135 ++++++++++++++++++
 2 files changed, 222 insertions(+)
 create mode 100644 klausur-service/backend/tests/test_merge_wrapped_rows.py

diff --git a/klausur-service/backend/cv_cell_grid.py b/klausur-service/backend/cv_cell_grid.py
index db1a8f6..56fd472 100644
--- a/klausur-service/backend/cv_cell_grid.py
+++ b/klausur-service/backend/cv_cell_grid.py
@@ -1447,6 +1447,90 @@ def _merge_phonetic_continuation_rows(
     return merged
 
 
+def _merge_wrapped_rows(
+    entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Merge rows where the primary column (EN) is empty — cell wrap continuation.
+
+    In textbook vocabulary tables, columns are often narrow, so the author
+    wraps text within a cell. OCR treats each physical line as a separate row.
+    The key indicator: if the EN column is empty but DE/example have text,
+    this row is a continuation of the previous row's cells.
+
+    Example (original textbook has ONE row):
+      Row 2: EN="take part (in)"  DE="teilnehmen (an), mitmachen"  EX="More than 200 singers took"
+      Row 3: EN=""  DE="(bei)"  EX="part in the concert."
+      → Merged: EN="take part (in)"  DE="teilnehmen (an), mitmachen (bei)"  EX="More than 200 singers took part in the concert."
+
+    Also handles the reverse case: DE empty but EN has text (wrap in EN column).
+    """
+    if len(entries) < 2:
+        return entries
+
+    merged: List[Dict[str, Any]] = []
+    for entry in entries:
+        en = (entry.get('english') or '').strip()
+        de = (entry.get('german') or '').strip()
+        ex = (entry.get('example') or '').strip()
+
+        if not merged:
+            merged.append(entry)
+            continue
+
+        prev = merged[-1]
+        prev_en = (prev.get('english') or '').strip()
+        prev_de = (prev.get('german') or '').strip()
+        prev_ex = (prev.get('example') or '').strip()
+
+        # Case 1: EN is empty → continuation of previous row
+        # (DE or EX have text that should be appended to previous row)
+        if not en and (de or ex) and prev_en:
+            if de:
+                if prev_de.endswith(','):
+                    sep = ' '  # "Wort," + " " + "Ausdruck"
+                elif prev_de.endswith(('-', '(')):
+                    sep = ''  # "teil-" + "nehmen" or "(" + "bei)"
+                else:
+                    sep = ' '
+                prev['german'] = (prev_de + sep + de).strip()
+            if ex:
+                sep = ' ' if prev_ex else ''
+                prev['example'] = (prev_ex + sep + ex).strip()
+            logger.debug(
+                f"Merged wrapped row {entry.get('row_index')} into previous "
+                f"(empty EN): DE={prev.get('german', '')!r}, EX={prev.get('example', '')!r}"
+            )
+            continue
+
+        # Case 2: DE is empty, EN has text that looks like continuation
+        # (starts with lowercase or is a parenthetical like "(bei)")
+        if en and not de and prev_de:
+            is_paren = en.startswith('(')
+            first_alpha = next((c for c in en if c.isalpha()), '')
+            starts_lower = first_alpha and first_alpha.islower()
+
+            if (is_paren or starts_lower) and len(en.split()) < 5:
+                sep = ' ' if prev_en and not prev_en.endswith(('-', '(')) else ''
+                prev['english'] = (prev_en + sep + en).strip()
+                if ex:
+                    sep2 = ' ' if prev_ex else ''
+                    prev['example'] = (prev_ex + sep2 + ex).strip()
+                logger.debug(
+                    f"Merged wrapped row {entry.get('row_index')} into previous "
+                    f"(empty DE): EN={prev['english']!r}"
+                )
+                continue
+
+        merged.append(entry)
+
+    if len(merged) < len(entries):
+        logger.info(
+            f"_merge_wrapped_rows: merged {len(entries) - len(merged)} "
+            f"continuation rows ({len(entries)} → {len(merged)})"
+        )
+    return merged
+
+
 def _merge_continuation_rows(
     entries: List[Dict[str, Any]],
 ) -> List[Dict[str, Any]]:
@@ -1561,6 +1645,9 @@ def build_word_grid(
     # --- Post-processing pipeline (deterministic, no LLM) ---
     n_raw = len(entries)
 
+    # 0. Merge cell-wrap continuation rows (empty primary column = text wrap)
+    entries = _merge_wrapped_rows(entries)
+
     # 0a. Merge phonetic-only continuation rows into previous entry
     entries = _merge_phonetic_continuation_rows(entries)
 
diff --git a/klausur-service/backend/tests/test_merge_wrapped_rows.py b/klausur-service/backend/tests/test_merge_wrapped_rows.py
new file mode 100644
index 0000000..d90fcf7
--- /dev/null
+++ b/klausur-service/backend/tests/test_merge_wrapped_rows.py
@@ -0,0 +1,135 @@
+"""Tests for _merge_wrapped_rows — cell-wrap continuation row merging."""
+
+import pytest
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+from cv_cell_grid import _merge_wrapped_rows
+
+
+def _entry(row_index, english='', german='', example=''):
+    return {
+        'row_index': row_index,
+        'english': english,
+        'german': german,
+        'example': example,
+    }
+
+
+class TestMergeWrappedRows:
+    """Test cell-wrap continuation row merging."""
+
+    def test_basic_en_empty_merge(self):
+        """EN empty, DE has text → merge DE into previous row."""
+        entries = [
+            _entry(0, english='take part (in)', german='teilnehmen (an), mitmachen', example='More than 200 singers took'),
+            _entry(1, english='', german='(bei)', example='part in the concert.'),
+        ]
+        result = _merge_wrapped_rows(entries)
+        assert len(result) == 1
+        assert result[0]['german'] == 'teilnehmen (an), mitmachen (bei)'
+        assert result[0]['example'] == 'More than 200 singers took part in the concert.'
+
+    def test_en_empty_de_only(self):
+        """EN empty, only DE continuation (no example)."""
+        entries = [
+            _entry(0, english='competition', german='der Wettbewerb,'),
+            _entry(1, english='', german='das Turnier'),
+        ]
+        result = _merge_wrapped_rows(entries)
+        assert len(result) == 1
+        assert result[0]['german'] == 'der Wettbewerb, das Turnier'
+
+    def test_en_empty_example_only(self):
+        """EN empty, only example continuation."""
+        entries = [
+            _entry(0, english='to arrive', german='ankommen', example='We arrived at the'),
+            _entry(1, english='', german='', example='hotel at midnight.'),
+        ]
+        result = _merge_wrapped_rows(entries)
+        assert len(result) == 1
+        assert result[0]['example'] == 'We arrived at the hotel at midnight.'
+
+    def test_de_empty_paren_continuation(self):
+        """DE empty, EN starts with parenthetical → merge into previous EN."""
+        entries = [
+            _entry(0, english='to take part', german='teilnehmen'),
+            _entry(1, english='(in)', german=''),
+        ]
+        result = _merge_wrapped_rows(entries)
+        assert len(result) == 1
+        assert result[0]['english'] == 'to take part (in)'
+
+    def test_de_empty_lowercase_continuation(self):
+        """DE empty, EN starts lowercase → merge into previous EN."""
+        entries = [
+            _entry(0, english='to put up', german='aufstellen'),
+            _entry(1, english='with sth.', german=''),
+        ]
+        result = _merge_wrapped_rows(entries)
+        assert len(result) == 1
+        assert result[0]['english'] == 'to put up with sth.'
+
+    def test_no_merge_both_have_content(self):
+        """Both EN and DE have text → normal row, don't merge."""
+        entries = [
+            _entry(0, english='house', german='Haus'),
+            _entry(1, english='garden', german='Garten'),
+        ]
+        result = _merge_wrapped_rows(entries)
+        assert len(result) == 2
+
+    def test_no_merge_new_word_uppercase(self):
+        """EN has uppercase text, DE is empty → could be a new word, not merged."""
+        entries = [
+            _entry(0, english='house', german='Haus'),
+            _entry(1, english='Garden', german=''),
+        ]
+        result = _merge_wrapped_rows(entries)
+        assert len(result) == 2
+
+    def test_triple_wrap(self):
+        """Three consecutive wrapped rows → all merge into first."""
+        entries = [
+            _entry(0, english='competition', german='der Wettbewerb,'),
+            _entry(1, english='', german='das Turnier,'),
+            _entry(2, english='', german='der Wettkampf'),
+        ]
+        result = _merge_wrapped_rows(entries)
+        assert len(result) == 1
+        assert result[0]['german'] == 'der Wettbewerb, das Turnier, der Wettkampf'
+
+    def test_empty_entries(self):
+        """Empty list."""
+        assert _merge_wrapped_rows([]) == []
+
+    def test_single_entry(self):
+        """Single entry unchanged."""
+        entries = [_entry(0, english='house', german='Haus')]
+        result = _merge_wrapped_rows(entries)
+        assert len(result) == 1
+
+    def test_mixed_normal_and_wrapped(self):
+        """Mix of normal rows and wrapped rows."""
+        entries = [
+            _entry(0, english='house', german='Haus'),
+            _entry(1, english='take part (in)', german='teilnehmen (an),'),
+            _entry(2, english='', german='mitmachen (bei)'),
+            _entry(3, english='garden', german='Garten'),
+        ]
+        result = _merge_wrapped_rows(entries)
+        assert len(result) == 3
+        assert result[0]['english'] == 'house'
+        assert result[1]['german'] == 'teilnehmen (an), mitmachen (bei)'
+        assert result[2]['english'] == 'garden'
+
+    def test_comma_separator_handling(self):
+        """Previous DE ends with comma → joined with exactly one space."""
+        entries = [
+            _entry(0, english='word', german='Wort,'),
+            _entry(1, english='', german='Ausdruck'),
+        ]
+        result = _merge_wrapped_rows(entries)
+        assert len(result) == 1
+        assert result[0]['german'] == 'Wort, Ausdruck'