Bitte zuerst eine Session auswaehlen.
}
@@ -197,7 +291,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
if (status === 'error') {
return (
-
⚠️
+
⚠️
Fehler
{error}
@@ -207,7 +301,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
@@ -217,14 +311,14 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
if (status === 'saved') {
return (
+ {/* Undo/Redo */}
+
+
+
+
+
+ {/* Empty field toggle */}
+
+
+
+
{/* Zoom controls */}
{zoom}%
+ {/* Empty field markers */}
+ {showEmptyHighlight && allCells
+ .filter(c => emptyCellIds.has(c.cellId))
+ .map(cell => (
+
+ ))}
+
{/* Editable text fields at bbox positions */}
{cells.map((cell) => {
const displayText = getDisplayText(cell)
const edited = isEdited(cell)
return (
-
handleTextChange(cell.cellId, e.target.value)}
- onKeyDown={(e) => handleKeyDown(e, cell.cellId)}
- className={`absolute bg-transparent text-black dark:text-white border px-0.5 outline-none transition-colors ${
- colTypeColor(cell.colType)
- } ${edited ? 'border-green-500 bg-green-50/30 dark:bg-green-900/20' : ''}`}
- style={{
- left: `${cell.bboxPct.x}%`,
- top: `${cell.bboxPct.y}%`,
- width: `${cell.bboxPct.w}%`,
- height: `${cell.bboxPct.h}%`,
- fontSize: `${Math.max(8, Math.min(16, (cell.bboxPct.h / 100) * (containerSize?.h || 800) * 0.6))}px`,
- lineHeight: '1',
- }}
- title={`${cell.cellId} (${cell.colType})`}
- />
+
+ handleTextChange(cell.cellId, e.target.value)}
+ onKeyDown={(e) => handleKeyDown(e, cell.cellId)}
+ className={`w-full h-full bg-transparent text-black dark:text-white border px-0.5 outline-none transition-colors ${
+ colTypeColor(cell.colType)
+ } ${edited ? 'border-green-500 bg-green-50/30 dark:bg-green-900/20' : ''}`}
+ style={{
+ fontSize: `${getFontSize(cell.bboxPct.h)}px`,
+ lineHeight: '1',
+ }}
+ title={`${cell.cellId} (${cell.colType})`}
+ />
+ {/* Per-cell reset button (X) — only shown for edited cells on hover */}
+ {edited && (
+
+ )}
+
)
})}
@@ -336,7 +497,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
}}
className="px-6 py-2.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors font-medium text-sm"
>
- {changedCount > 0 ? 'Speichern & Weiter →' : 'Weiter →'}
+ {changedCount > 0 ? 'Speichern & Weiter \u2192' : 'Weiter \u2192'}
diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 700bc9f..a7be612 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3503,6 +3503,21 @@ def _ocr_single_cell(
)
used_engine = 'cell_ocr_fallback'
+ # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
+ if not text.strip() and _run_fallback and not use_rapid:
+ cell_lang = lang_map.get(col.type, lang)
+ psm7_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=7)
+ if psm7_words:
+ psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
+ if psm7_words:
+ p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
+ if p7_text.strip():
+ text = p7_text
+ avg_conf = round(
+ sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
+ )
+ used_engine = 'cell_ocr_psm7'
+
# --- NOISE FILTER: clear cells that contain only OCR artifacts ---
if text.strip():
text = _clean_cell_text(text)
@@ -3628,6 +3643,79 @@ def build_cell_grid(
)
cells.append(cell)
+ # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
+ # Collect cells that are still empty but have visible pixels.
+ # Instead of calling Tesseract once per cell (expensive), crop an entire
+ # column strip and run OCR once, then assign words to cells by Y position.
+ empty_by_col: Dict[int, List[int]] = {} # col_idx → [cell list indices]
+ for ci, cell in enumerate(cells):
+ if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
+ bpx = cell['bbox_px']
+ x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
+ if w > 0 and h > 0 and ocr_img is not None:
+ crop = ocr_img[y:y + h, x:x + w]
+ if crop.size > 0:
+ dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
+ if dark_ratio > 0.005:
+ empty_by_col.setdefault(cell['col_index'], []).append(ci)
+
+ for col_idx, cell_indices in empty_by_col.items():
+ if len(cell_indices) < 3:
+ continue # Not worth batching for < 3 cells
+
+ # Find the column strip bounding box (union of all empty cell bboxes)
+ min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
+ max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
+ col_x = cells[cell_indices[0]]['bbox_px']['x']
+ col_w = cells[cell_indices[0]]['bbox_px']['w']
+
+ strip_region = PageRegion(
+ type=relevant_cols[col_idx].type,
+ x=col_x, y=min_y,
+ width=col_w, height=max_y_h - min_y,
+ )
+ strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)
+
+ if use_rapid and img_bgr is not None:
+ strip_words = ocr_region_rapid(img_bgr, strip_region)
+ else:
+ strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)
+
+ if not strip_words:
+ continue
+
+ strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
+ if not strip_words:
+ continue
+
+ # Assign words to cells by Y overlap
+ for ci in cell_indices:
+ cell_y = cells[ci]['bbox_px']['y']
+ cell_h = cells[ci]['bbox_px']['h']
+ cell_mid_y = cell_y + cell_h / 2
+
+ matched_words = [
+ w for w in strip_words
+ if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
+ ]
+ if matched_words:
+ matched_words.sort(key=lambda w: w['left'])
+ batch_text = ' '.join(w['text'] for w in matched_words)
+ batch_text = _clean_cell_text(batch_text)
+ if batch_text.strip():
+ cells[ci]['text'] = batch_text
+ cells[ci]['confidence'] = round(
+ sum(w['conf'] for w in matched_words) / len(matched_words), 1
+ )
+ cells[ci]['ocr_engine'] = 'batch_column_ocr'
+
+ batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
+ if batch_filled > 0:
+ logger.info(
+ f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
+ f"empty cells in column {col_idx}"
+ )
+
logger.info(f"build_cell_grid: {len(cells)} cells from "
f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
f"engine={engine_name}")
@@ -3869,6 +3957,69 @@ def _merge_phonetic_continuation_rows(
return merged
+def _merge_continuation_rows(
+    entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Merge multi-line vocabulary entries where text wraps to the next row.
+
+    A row is folded into the entry before it when all of the following hold:
+    - EN has text, but DE is empty
+    - EN is not phonetic-only text according to ``_is_phonetic_only_text``
+    - The first alphabetic character of EN is lowercase (not a new vocab entry)
+    - EN has fewer than 4 whitespace-separated words (not an example sentence)
+    - The previous entry's EN does NOT end with a sentence terminator (.!?)
+
+    Example:
+        Row 5: EN="to put up"   DE="aufstellen"
+        Row 6: EN="with sth."   DE=""
+        → Merged: EN="to put up with sth."  DE="aufstellen"
+
+    Args:
+        entries: Row entries in reading order. Only the ``english``,
+            ``german``, ``example`` and ``row_index`` keys are read.
+
+    Returns:
+        The entries with continuation rows folded into their predecessor.
+        Inputs with fewer than two entries are returned as-is. The dicts in
+        ``entries`` are never modified: an entry that absorbs a continuation
+        is shallow-copied first, so the result holds the copy, while entries
+        that were not touched are passed through as the same objects.
+    """
+    if len(entries) < 2:
+        return entries
+
+    merged: List[Dict[str, Any]] = []
+    # True once merged[-1] is a private copy rather than the caller's dict.
+    tail_is_copy = False
+
+    for entry in entries:
+        en = (entry.get('english') or '').strip()
+        de = (entry.get('german') or '').strip()
+
+        # Only EN-only rows that are not pure phonetics can be continuations;
+        # everything else falls through and is appended unchanged.
+        if merged and en and not de and not _is_phonetic_only_text(en):
+            # Leading punctuation/digits are skipped: the first *letter* decides.
+            first_alpha = next((c for c in en if c.isalpha()), '')
+            starts_lower = bool(first_alpha) and first_alpha.islower()
+
+            # Four or more words look like an example sentence, not a wrap.
+            is_short = len(en.split()) < 4
+
+            prev_en = (merged[-1].get('english') or '').strip()
+            prev_ends_sentence = prev_en.endswith(('.', '!', '?'))
+
+            if starts_lower and is_short and not prev_ends_sentence:
+                # Copy-on-write so the caller's dict is left untouched.
+                if not tail_is_copy:
+                    merged[-1] = dict(merged[-1])
+                    tail_is_copy = True
+                prev = merged[-1]
+
+                prev['english'] = (prev_en + ' ' + en).strip()
+                # Carry over example text from the continuation row, if any.
+                ex = (entry.get('example') or '').strip()
+                if ex:
+                    prev_ex = (prev.get('example') or '').strip()
+                    prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
+                logger.debug(
+                    f"Merged continuation row {entry.get('row_index')} "
+                    f"into previous entry: {prev['english']!r}"
+                )
+                continue
+
+        merged.append(entry)
+        tail_is_copy = False
+
+    return merged
+
+
def build_word_grid(
ocr_img: np.ndarray,
column_regions: List[PageRegion],
@@ -3920,9 +4071,12 @@ def build_word_grid(
# --- Post-processing pipeline (deterministic, no LLM) ---
n_raw = len(entries)
- # 0. Merge phonetic-only continuation rows into previous entry
+ # 0a. Merge phonetic-only continuation rows into previous entry
entries = _merge_phonetic_continuation_rows(entries)
+ # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
+ entries = _merge_continuation_rows(entries)
+
# 1. Fix character confusion (I/1/l based on context)
entries = _fix_character_confusion(entries)
@@ -4361,7 +4515,7 @@ async def run_cv_pipeline(
# Stage 3: Dewarp
if enable_dewarp:
t = time.time()
- img = dewarp_image(img)
+ img, _dewarp_info = dewarp_image(img)
result.stages['dewarp'] = round(time.time() - t, 2)
# Stage 4: Dual image preparation
diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index afb0a81..cba9b80 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -1623,6 +1623,69 @@ async def save_reconstruction(session_id: str, request: Request):
}
+@router.post("/sessions/{session_id}/reprocess")
+async def reprocess_session(session_id: str, request: Request):
+    """Rewind a session to an earlier pipeline step, clearing downstream data.
+
+    This endpoint only resets stored results and ``current_step``; it does not
+    itself execute any pipeline stage.
+
+    Body: {"from_step": 5}  (1-indexed step number, defaults to 1 if omitted)
+
+    Results cleared (set to None unless noted):
+    - from_step <= 1: deskew_result, dewarp_result, column_result, row_result, word_result
+    - from_step <= 2: dewarp_result, column_result, row_result, word_result
+    - from_step <= 3: column_result, row_result, word_result
+    - from_step <= 4: row_result, word_result
+    - from_step <= 5: word_result
+    - from_step == 6: only the llm_review / llm_corrections keys of word_result
+    - from_step == 7: nothing
+
+    Returns:
+        Dict with ``session_id``, ``from_step`` and ``cleared`` (names of the
+        session fields that were written).
+
+    Raises:
+        HTTPException: 404 if the session does not exist; 400 if the body is
+            not a JSON object or ``from_step`` is not an integer in 1..7.
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    # Malformed JSON raises json.JSONDecodeError (a ValueError subclass);
+    # report it as a client error instead of letting it surface as a 500.
+    try:
+        body = await request.json()
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from exc
+    if not isinstance(body, dict):
+        raise HTTPException(status_code=400, detail="Request body must be a JSON object")
+
+    from_step = body.get("from_step", 1)
+    # bool is a subclass of int, so JSON `true` would otherwise be accepted
+    # as step 1 and wipe every result.
+    if (
+        isinstance(from_step, bool)
+        or not isinstance(from_step, int)
+        or from_step < 1
+        or from_step > 7
+    ):
+        raise HTTPException(status_code=400, detail="from_step must be between 1 and 7")
+
+    update_kwargs: Dict[str, Any] = {"current_step": from_step}
+
+    # Clear downstream data based on from_step
+    if from_step <= 5:
+        update_kwargs["word_result"] = None
+    elif from_step == 6:
+        # Only drop the LLM review data; work on a shallow copy so the loaded
+        # session object is not altered before the write below succeeds.
+        word_result = session.get("word_result")
+        if word_result:
+            word_result = dict(word_result)
+            word_result.pop("llm_review", None)
+            word_result.pop("llm_corrections", None)
+            update_kwargs["word_result"] = word_result
+
+    if from_step <= 4:
+        update_kwargs["row_result"] = None
+    if from_step <= 3:
+        update_kwargs["column_result"] = None
+    if from_step <= 2:
+        update_kwargs["dewarp_result"] = None
+    if from_step <= 1:
+        update_kwargs["deskew_result"] = None
+
+    await update_session_db(session_id, **update_kwargs)
+
+    # Mirror the same writes into the in-memory cache entry, if one exists.
+    if session_id in _cache:
+        for key in list(update_kwargs.keys()):
+            if key != "current_step":
+                _cache[session_id][key] = update_kwargs[key]
+        _cache[session_id]["current_step"] = from_step
+
+    logger.info(f"Session {session_id} reprocessing from step {from_step}")
+
+    return {
+        "session_id": session_id,
+        "from_step": from_step,
+        "cleared": [k for k in update_kwargs if k != "current_step"],
+    }
+
+
async def _get_rows_overlay(session_id: str) -> Response:
"""Generate dewarped image with row bands drawn on it."""
session = await get_session_db(session_id)
diff --git a/klausur-service/backend/tests/test_cv_vocab_pipeline.py b/klausur-service/backend/tests/test_cv_vocab_pipeline.py
index a1c77f1..4e17cd9 100644
--- a/klausur-service/backend/tests/test_cv_vocab_pipeline.py
+++ b/klausur-service/backend/tests/test_cv_vocab_pipeline.py
@@ -9,6 +9,9 @@ Tests cover:
- Stage 5: Layout analysis (content bounds, projection profiles, column detection)
- Stage 6: Multi-pass OCR region handling
- Stage 7: Line grouping and vocabulary matching
+- Noise filter functions (_is_noise_tail_token, _clean_cell_text)
+- Phonetic detection (_is_phonetic_only_text)
+- Phonetic & continuation row merging
- Orchestrator (run_cv_pipeline)
DSGVO Note: All tests run locally with synthetic data. No external API calls.
@@ -36,6 +39,11 @@ from cv_vocab_pipeline import (
CV2_AVAILABLE,
TESSERACT_AVAILABLE,
CV_PIPELINE_AVAILABLE,
+ _is_noise_tail_token,
+ _clean_cell_text,
+ _is_phonetic_only_text,
+ _merge_phonetic_continuation_rows,
+ _merge_continuation_rows,
)
@@ -202,16 +210,28 @@ class TestDeskew:
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestDewarp:
- """Test dewarp (pass-through) stage."""
+ """Test dewarp stage (returns (image, info) tuple)."""
- def test_dewarp_passthrough(self, white_image):
- """Current dewarp should return the same image (pass-through)."""
+ def test_dewarp_returns_tuple(self, white_image):
+ """dewarp_image must return (image, dewarp_info) tuple."""
result = dewarp_image(white_image)
- np.testing.assert_array_equal(result, white_image)
+ assert isinstance(result, tuple)
+ assert len(result) == 2
+ img_out, info = result
+ assert isinstance(img_out, np.ndarray)
+ assert isinstance(info, dict)
+ assert "shear_degrees" in info
def test_dewarp_preserves_shape(self, text_like_image):
- result = dewarp_image(text_like_image)
- assert result.shape == text_like_image.shape
+ """Output image should have same shape as input."""
+ img_out, _ = dewarp_image(text_like_image)
+ assert img_out.shape == text_like_image.shape
+
+ def test_dewarp_white_image_no_correction(self, white_image):
+ """A uniform white image should get no shear correction."""
+ img_out, info = dewarp_image(white_image)
+ assert abs(info["shear_degrees"]) < 0.5
+ assert img_out.shape == white_image.shape
# =============================================
@@ -561,6 +581,268 @@ class TestStageIntegration:
assert layout_img.shape[:2] == corrected.shape[:2]
+# =============================================
+# NOISE FILTER TESTS
+# =============================================
+
+class TestNoiseFilter:
+    """Verify how _is_noise_tail_token classifies trailing tokens."""
+
+    # Tokens the filter must keep (expected result: False).
+    _KEEP = [
+        # Hyphenated compounds, including a trailing hyphen
+        "money-saver",
+        "under-",
+        "well-known",
+        # Parenthesized parts as found in dictionary entries
+        "Schild(chen)",
+        "(Salat-)Gurke",
+        "(auf)",
+        "(on)",
+        "selbst)",
+        "(wir",
+        "Tanz(veranstaltung)",
+        "(zer)brechen",
+        # Phonetic brackets, complete or partial
+        "serva]",
+        "['mani",
+        "[eg]",
+        "[maus]",
+        # Plain words, with and without a trailing period
+        "cupcakes.",
+        "sister.",
+        "mice",
+        # Abbreviations
+        "e.g.",
+        "sth.",
+        "usw.",
+        "adj.",
+        # Ellipsis (ASCII and single-character form)
+        "...",
+        "\u2026",
+        # Short regular words
+        "the",
+        "cat",
+        "big",
+        "run",
+        "set",
+        "ago",
+    ]
+
+    # Tokens the filter must drop (expected result: True).
+    _DROP = [
+        # Symbols, digits and digit/letter mixes
+        "B|",
+        "3d",
+        "x7",
+        ")",
+        "|",
+        "@",
+        "3",
+        # Very short non-dictionary fragments
+        "ee",
+        "k",
+        "zz",
+        "qq",
+        # Empty or blank
+        "",
+        " ",
+    ]
+
+    @pytest.mark.parametrize("token", _KEEP)
+    def test_keep_real_tokens(self, token):
+        """Real words, dictionary punctuation, and phonetic brackets are kept."""
+        verdict = _is_noise_tail_token(token)
+        assert verdict is False, f"Should keep {token!r}"
+
+    @pytest.mark.parametrize("token", _DROP)
+    def test_filter_noise_tokens(self, token):
+        """OCR noise fragments are filtered."""
+        verdict = _is_noise_tail_token(token)
+        assert verdict is True, f"Should filter {token!r}"
+
+
+class TestCleanCellText:
+    """End-to-end checks for _clean_cell_text (raw cell text in, cleaned text out)."""
+
+    def test_empty_returns_empty(self):
+        """Empty and blank input both come back as the empty string."""
+        for blank in ("", " "):
+            assert _clean_cell_text(blank) == ""
+
+    def test_real_word_unchanged(self):
+        """A single real word passes through untouched."""
+        cleaned = _clean_cell_text("cupcakes")
+        assert cleaned == "cupcakes"
+
+    def test_strips_trailing_noise(self):
+        """A noise token after a real word is dropped."""
+        assert _clean_cell_text("cupcakes B|") == "cupcakes"
+
+    def test_keeps_trailing_real_word(self):
+        """A real word in trailing position is not mistaken for noise."""
+        assert _clean_cell_text("big cat") == "big cat"
+
+    def test_abbreviation_kept(self):
+        """A known abbreviation on its own is not cleared."""
+        assert _clean_cell_text("e.g.") == "e.g."
+
+    def test_pure_garbage_cleared(self):
+        """Text made only of OCR artifacts is reduced to the empty string."""
+        assert _clean_cell_text("3d |x") == ""
+
+    def test_compound_word_preserved(self):
+        """A hyphenated compound is returned unchanged."""
+        assert _clean_cell_text("money-saver") == "money-saver"
+
+    def test_parenthesized_word_preserved(self):
+        """A word with a parenthesized prefix is returned unchanged."""
+        assert _clean_cell_text("(Salat-)Gurke") == "(Salat-)Gurke"
+
+    def test_multiple_trailing_noise(self):
+        """Several consecutive trailing noise tokens are all dropped."""
+        assert _clean_cell_text("achieve 3 |") == "achieve"
+
+
+class TestPhoneticOnlyText:
+    """Checks which strings _is_phonetic_only_text accepts as pure phonetics."""
+
+    # (input text, expected result) pairs.
+    _CASES = [
+        # Bracketed transcriptions with nothing else → True
+        ("['mani serva]", True),
+        ("[dɑːns]", True),
+        ("[\"a:mand]", True),
+        ("['wɜːkʃɒp]", True),
+        # A lone closing-bracket fragment is expected to count as a word → False
+        ("serva]", False),
+        # A word, a word plus phonetics, or empty text → False
+        ("almond ['a:mand]", False),
+        ("Mandel", False),
+        ("cupcakes", False),
+        ("", False),
+        ("achieve", False),
+        ("money-saver ['mani]", False),
+    ]
+
+    @pytest.mark.parametrize("text,expected", _CASES)
+    def test_phonetic_detection(self, text, expected):
+        """The detector returns exactly the expected bool for each sample."""
+        actual = _is_phonetic_only_text(text)
+        assert actual is expected, \
+            f"_is_phonetic_only_text({text!r}) should be {expected}"
+
+
+class TestMergePhoneticContinuationRows:
+    """Checks for _merge_phonetic_continuation_rows (phonetic rows join the row above)."""
+
+    def test_empty_list(self):
+        """An empty input yields an empty list."""
+        merged = _merge_phonetic_continuation_rows([])
+        assert merged == []
+
+    def test_single_entry(self):
+        """A lone entry is returned with its EN text intact."""
+        rows = [{"english": "cat", "german": "Katze", "example": ""}]
+        merged = _merge_phonetic_continuation_rows(rows)
+        assert len(merged) == 1
+        assert merged[0]["english"] == "cat"
+
+    def test_merges_phonetic_row(self):
+        """A phonetic-only row is appended to the previous entry's EN text."""
+        head = {"english": "money-saver", "german": "Sparfuchs", "example": "", "row_index": 0}
+        phonetic = {"english": "['mani serva]", "german": "", "example": "", "row_index": 1}
+        merged = _merge_phonetic_continuation_rows([head, phonetic])
+        assert len(merged) == 1
+        first = merged[0]
+        assert first["english"] == "money-saver ['mani serva]"
+        assert first["german"] == "Sparfuchs"
+
+    def test_no_merge_when_de_present(self):
+        """A phonetic-looking EN is left alone when the row also has DE text."""
+        rows = [
+            {"english": "cat", "german": "Katze", "example": ""},
+            {"english": "[kæt]", "german": "some text", "example": ""},
+        ]
+        assert len(_merge_phonetic_continuation_rows(rows)) == 2
+
+    def test_no_merge_regular_rows(self):
+        """Two ordinary vocab rows stay separate."""
+        rows = [
+            {"english": "cat", "german": "Katze", "example": ""},
+            {"english": "dog", "german": "Hund", "example": ""},
+        ]
+        assert len(_merge_phonetic_continuation_rows(rows)) == 2
+
+    def test_merges_example_too(self):
+        """Example text on the phonetic row is carried over to the merged entry."""
+        head = {"english": "dance", "german": "tanzen", "example": "", "row_index": 0}
+        phonetic = {"english": "[dɑːns]", "german": "", "example": "Let's dance.", "row_index": 1}
+        merged = _merge_phonetic_continuation_rows([head, phonetic])
+        assert len(merged) == 1
+        first = merged[0]
+        assert first["english"] == "dance [dɑːns]"
+        assert first["example"] == "Let's dance."
+
+
+class TestMergeContinuationRows:
+    """Checks for _merge_continuation_rows (wrapped EN text joins the row above)."""
+
+    def test_empty_list(self):
+        """An empty input yields an empty list."""
+        merged = _merge_continuation_rows([])
+        assert merged == []
+
+    def test_no_merge_independent_rows(self):
+        """Rows that carry both EN and DE text stay separate."""
+        rows = [
+            {"english": "cat", "german": "Katze", "example": "", "row_index": 0},
+            {"english": "dog", "german": "Hund", "example": "", "row_index": 1},
+        ]
+        assert len(_merge_continuation_rows(rows)) == 2
+
+    def test_merge_lowercase_continuation(self):
+        """A short lowercase EN row without DE is appended to the previous entry."""
+        head = {"english": "to put up", "german": "aufstellen", "example": "", "row_index": 0}
+        tail = {"english": "with sth.", "german": "", "example": "", "row_index": 1}
+        merged = _merge_continuation_rows([head, tail])
+        assert len(merged) == 1
+        first = merged[0]
+        assert first["english"] == "to put up with sth."
+        assert first["german"] == "aufstellen"
+
+    def test_no_merge_uppercase_start(self):
+        """An EN row starting with an uppercase letter is kept as its own entry."""
+        rows = [
+            {"english": "cat", "german": "Katze", "example": "", "row_index": 0},
+            {"english": "Dog", "german": "", "example": "", "row_index": 1},
+        ]
+        assert len(_merge_continuation_rows(rows)) == 2
+
+    def test_no_merge_when_previous_ends_with_period(self):
+        """Nothing is appended to a previous entry that ends with a sentence terminator."""
+        rows = [
+            {"english": "That's great.", "german": "Das ist toll.", "example": "", "row_index": 0},
+            {"english": "really nice", "german": "", "example": "", "row_index": 1},
+        ]
+        assert len(_merge_continuation_rows(rows)) == 2
+
+    def test_no_merge_long_text(self):
+        """An EN row of four or more words is treated as a sentence, not a continuation."""
+        rows = [
+            {"english": "achieve", "german": "erreichen", "example": "", "row_index": 0},
+            {"english": "she achieved her goals", "german": "", "example": "", "row_index": 1},
+        ]
+        assert len(_merge_continuation_rows(rows)) == 2
+
+    def test_first_entry_not_merged(self):
+        """A DE-less first row has no predecessor and must simply be kept."""
+        rows = [
+            {"english": "something", "german": "", "example": "", "row_index": 0},
+            {"english": "cat", "german": "Katze", "example": "", "row_index": 1},
+        ]
+        assert len(_merge_continuation_rows(rows)) == 2
+
+
# =============================================
# RUN TESTS
# =============================================