From e718353d9fedc4e5dcda5f84fa53f158d78accaf Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 2 Mar 2026 14:46:38 +0100 Subject: [PATCH] feat(ocr-pipeline): 6 systematic improvements for robustness, performance & UX 1. Unit tests: 76 new parametrized tests for noise filter, phonetic detection, cell text cleaning, and row merging (116 total, all green) 2. Continuation-row merge: detect multi-line vocab entries where text wraps (lowercase EN + empty DE) and merge into previous entry 3. Empty DE fallback: secondary PSM=7 OCR pass for cells missed by PSM=6 4. Batch-OCR: collect empty cells per column, run single Tesseract call on column strip instead of per-cell (~66% fewer calls for 3+ empty cells) 5. StepReconstruction UI: font scaling via naturalHeight, empty EN/DE field highlighting, undo/redo (Ctrl+Z), per-cell reset button 6. Session reprocess: POST /sessions/{id}/reprocess endpoint to re-run from any step, with reprocess button on completed pipeline steps Also fixes pre-existing dewarp_image tuple unpacking bug in run_cv_pipeline and updates dewarp tests to match current (image, info) return signature. Co-Authored-By: Claude Opus 4.6 --- .../app/(admin)/ai/ocr-pipeline/page.tsx | 25 +- .../ocr-pipeline/PipelineStepper.tsx | 53 ++-- .../ocr-pipeline/StepReconstruction.tsx | 261 +++++++++++++--- klausur-service/backend/cv_vocab_pipeline.py | 158 +++++++++- klausur-service/backend/ocr_pipeline_api.py | 63 ++++ .../backend/tests/test_cv_vocab_pipeline.py | 294 +++++++++++++++++- 6 files changed, 775 insertions(+), 79 deletions(-) diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx b/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx index 2ebde8c..d7edf5a 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx +++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx @@ -160,6 +160,29 @@ export default function OcrPipelinePage() { 8: 'Validierung', } + const reprocessFromStep = useCallback(async (uiStep: number) => { + if (!sessionId) return + const dbStep = uiStep + 1 // UI is 0-indexed, DB is 1-indexed + if (!confirm(`Ab Schritt ${dbStep} (${stepNames[dbStep] || '?'}) neu verarbeiten? Nachfolgende Daten werden geloescht.`)) return + try { + const res = await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/reprocess`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ from_step: dbStep }), + }) + if (!res.ok) { + const data = await res.json().catch(() => ({})) + console.error('Reprocess failed:', data.detail || res.status) + return + } + // Reset UI steps + goToStep(uiStep) + } catch (e) { + console.error('Reprocess error:', e) + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [sessionId, goToStep]) + const renderStep = () => { switch (currentStep) { case 0: @@ -291,7 +314,7 @@ export default function OcrPipelinePage() { )} - +
{renderStep()}
diff --git a/admin-lehrer/components/ocr-pipeline/PipelineStepper.tsx b/admin-lehrer/components/ocr-pipeline/PipelineStepper.tsx index ef16c98..e72b8cf 100644 --- a/admin-lehrer/components/ocr-pipeline/PipelineStepper.tsx +++ b/admin-lehrer/components/ocr-pipeline/PipelineStepper.tsx @@ -6,9 +6,10 @@ interface PipelineStepperProps { steps: PipelineStep[] currentStep: number onStepClick: (index: number) => void + onReprocess?: (index: number) => void } -export function PipelineStepper({ steps, currentStep, onStepClick }: PipelineStepperProps) { +export function PipelineStepper({ steps, currentStep, onStepClick, onReprocess }: PipelineStepperProps) { return (
{steps.map((step, index) => { @@ -26,25 +27,37 @@ export function PipelineStepper({ steps, currentStep, onStepClick }: PipelineSte }`} /> )} - +
+ + {/* Reprocess button — shown on completed steps on hover */} + {isCompleted && onReprocess && ( + + )} +
) })} diff --git a/admin-lehrer/components/ocr-pipeline/StepReconstruction.tsx b/admin-lehrer/components/ocr-pipeline/StepReconstruction.tsx index ea5c197..b564818 100644 --- a/admin-lehrer/components/ocr-pipeline/StepReconstruction.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepReconstruction.tsx @@ -20,13 +20,23 @@ interface EditableCell { colIndex: number } +type UndoAction = { cellId: string; oldText: string; newText: string } + export function StepReconstruction({ sessionId, onNext }: StepReconstructionProps) { const [status, setStatus] = useState<'loading' | 'ready' | 'saving' | 'saved' | 'error'>('loading') const [error, setError] = useState('') const [cells, setCells] = useState([]) const [editedTexts, setEditedTexts] = useState>(new Map()) const [zoom, setZoom] = useState(100) - const [containerSize, setContainerSize] = useState<{ w: number; h: number } | null>(null) + const [imageNaturalH, setImageNaturalH] = useState(0) + const [showEmptyHighlight, setShowEmptyHighlight] = useState(true) + + // Undo/Redo stacks + const [undoStack, setUndoStack] = useState([]) + const [redoStack, setRedoStack] = useState([]) + + // All cells including empty ones (for empty field highlighting) + const [allCells, setAllCells] = useState([]) const containerRef = useRef(null) const imageRef = useRef(null) @@ -38,16 +48,11 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp // eslint-disable-next-line react-hooks/exhaustive-deps }, [sessionId]) - // Track container size for font scaling - useEffect(() => { - if (!containerRef.current) return - const observer = new ResizeObserver((entries) => { - for (const entry of entries) { - setContainerSize({ w: entry.contentRect.width, h: entry.contentRect.height }) - } - }) - observer.observe(containerRef.current) - return () => observer.disconnect() + // Track image natural height for font scaling + const handleImageLoad = useCallback(() => { + if (imageRef.current) { + setImageNaturalH(imageRef.current.naturalHeight) + } }, []) const loadSessionData = async () => { @@ -67,19 +72,21 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp // Build editable cells from grid cells const gridCells: GridCell[] = wordResult.cells || [] - const editableCells: EditableCell[] = gridCells - .filter(c => c.text.trim() !== '') - .map(c => ({ - cellId: c.cell_id, - text: c.text, - originalText: c.text, - bboxPct: c.bbox_pct, - colType: c.col_type, - rowIndex: c.row_index, - colIndex: c.col_index, - })) + const allEditableCells: EditableCell[] = gridCells.map(c => ({ + cellId: c.cell_id, + text: c.text, + originalText: c.text, + bboxPct: c.bbox_pct, + colType: c.col_type, + rowIndex: c.row_index, + colIndex: c.col_index, + })) - setCells(editableCells) + setAllCells(allEditableCells) + setCells(allEditableCells.filter(c => c.text.trim() !== '')) + setEditedTexts(new Map()) + setUndoStack([]) + setRedoStack([]) setStatus('ready') } catch (e: unknown) { setError(e instanceof Error ? e.message : String(e)) @@ -89,12 +96,80 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp const handleTextChange = useCallback((cellId: string, newText: string) => { setEditedTexts(prev => { + const oldText = prev.get(cellId) + const cell = cells.find(c => c.cellId === cellId) + const prevText = oldText ?? cell?.text ?? '' + + // Push to undo stack + setUndoStack(stack => [...stack, { cellId, oldText: prevText, newText }]) + setRedoStack([]) // Clear redo on new edit + const next = new Map(prev) next.set(cellId, newText) return next }) + }, [cells]) + + const undo = useCallback(() => { + setUndoStack(stack => { + if (stack.length === 0) return stack + const action = stack[stack.length - 1] + const newStack = stack.slice(0, -1) + + setRedoStack(rs => [...rs, action]) + setEditedTexts(prev => { + const next = new Map(prev) + next.set(action.cellId, action.oldText) + return next + }) + + return newStack + }) }, []) + const redo = useCallback(() => { + setRedoStack(stack => { + if (stack.length === 0) return stack + const action = stack[stack.length - 1] + const newStack = stack.slice(0, -1) + + setUndoStack(us => [...us, action]) + setEditedTexts(prev => { + const next = new Map(prev) + next.set(action.cellId, action.newText) + return next + }) + + return newStack + }) + }, []) + + const resetCell = useCallback((cellId: string) => { + const cell = cells.find(c => c.cellId === cellId) + if (!cell) return + setEditedTexts(prev => { + const next = new Map(prev) + next.delete(cellId) + return next + }) + }, [cells]) + + // Global keyboard shortcuts for undo/redo + useEffect(() => { + const handler = (e: KeyboardEvent) => { + if ((e.metaKey || e.ctrlKey) && e.key === 'z') { + e.preventDefault() + if (e.shiftKey) { + redo() + } else { + undo() + } + } + } + document.addEventListener('keydown', handler) + return () => document.removeEventListener('keydown', handler) + }, [undo, redo]) + const getDisplayText = useCallback((cell: EditableCell): string => { return editedTexts.get(cell.cellId) ?? cell.text }, [editedTexts]) @@ -112,6 +187,18 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp return count }, [cells, isEdited]) + // Identify empty required cells (EN or DE columns with no text) + const emptyCellIds = useMemo(() => { + const required = new Set(['column_en', 'column_de']) + const ids = new Set() + for (const cell of allCells) { + if (required.has(cell.colType) && !cell.text.trim()) { + ids.add(cell.cellId) + } + } + return ids + }, [allCells]) + // Sort cells for tab navigation: by row, then by column const sortedCellIds = useMemo(() => { return [...cells] @@ -181,6 +268,13 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp return colors[colType] || 'border-gray-400/40 focus:border-gray-500' } + // Font size based on image natural height (not container) scaled by zoom + const getFontSize = useCallback((bboxH: number): number => { + const baseH = imageNaturalH || 800 + const px = (bboxH / 100) * baseH * 0.55 + return Math.max(8, Math.min(18, px * (zoom / 100))) + }, [imageNaturalH, zoom]) + if (!sessionId) { return
Bitte zuerst eine Session auswaehlen.
} @@ -197,7 +291,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp if (status === 'error') { return (
-
⚠️
+
⚠️

Fehler

{error}

@@ -207,7 +301,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
@@ -217,14 +311,14 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp if (status === 'saved') { return (
-
+

Rekonstruktion gespeichert

{changedCount > 0 ? `${changedCount} Zellen wurden aktualisiert.` : 'Keine Aenderungen vorgenommen.'}

) @@ -239,16 +333,54 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp Schritt 7: Rekonstruktion - {cells.length} Zellen · {changedCount} geaendert + {cells.length} Zellen · {changedCount} geaendert + {emptyCellIds.size > 0 && showEmptyHighlight && ( + · {emptyCellIds.size} leer + )}
+ {/* Undo/Redo */} + + + +
+ + {/* Empty field toggle */} + + +
+ {/* Zoom controls */} {zoom}% + )} +
) })}
@@ -336,7 +497,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp }} className="px-6 py-2.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors font-medium text-sm" > - {changedCount > 0 ? 'Speichern & Weiter →' : 'Weiter →'} + {changedCount > 0 ? 'Speichern & Weiter \u2192' : 'Weiter \u2192'}
diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 700bc9f..a7be612 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3503,6 +3503,21 @@ def _ocr_single_cell( ) used_engine = 'cell_ocr_fallback' + # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells --- + if not text.strip() and _run_fallback and not use_rapid: + cell_lang = lang_map.get(col.type, lang) + psm7_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=7) + if psm7_words: + psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF] + if psm7_words: + p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10) + if p7_text.strip(): + text = p7_text + avg_conf = round( + sum(w['conf'] for w in psm7_words) / len(psm7_words), 1 + ) + used_engine = 'cell_ocr_psm7' + # --- NOISE FILTER: clear cells that contain only OCR artifacts --- if text.strip(): text = _clean_cell_text(text) @@ -3628,6 +3643,79 @@ def build_cell_grid( ) cells.append(cell) + # --- BATCH FALLBACK: re-OCR empty cells by column strip --- + # Collect cells that are still empty but have visible pixels. + # Instead of calling Tesseract once per cell (expensive), crop an entire + # column strip and run OCR once, then assign words to cells by Y position. + empty_by_col: Dict[int, List[int]] = {} # col_idx → [cell list indices] + for ci, cell in enumerate(cells): + if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7': + bpx = cell['bbox_px'] + x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h'] + if w > 0 and h > 0 and ocr_img is not None: + crop = ocr_img[y:y + h, x:x + w] + if crop.size > 0: + dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size + if dark_ratio > 0.005: + empty_by_col.setdefault(cell['col_index'], []).append(ci) + + for col_idx, cell_indices in empty_by_col.items(): + if len(cell_indices) < 3: + continue # Not worth batching for < 3 cells + + # Find the column strip bounding box (union of all empty cell bboxes) + min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices) + max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices) + col_x = cells[cell_indices[0]]['bbox_px']['x'] + col_w = cells[cell_indices[0]]['bbox_px']['w'] + + strip_region = PageRegion( + type=relevant_cols[col_idx].type, + x=col_x, y=min_y, + width=col_w, height=max_y_h - min_y, + ) + strip_lang = lang_map.get(relevant_cols[col_idx].type, lang) + + if use_rapid and img_bgr is not None: + strip_words = ocr_region_rapid(img_bgr, strip_region) + else: + strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6) + + if not strip_words: + continue + + strip_words = [w for w in strip_words if w.get('conf', 0) >= 30] + if not strip_words: + continue + + # Assign words to cells by Y overlap + for ci in cell_indices: + cell_y = cells[ci]['bbox_px']['y'] + cell_h = cells[ci]['bbox_px']['h'] + cell_mid_y = cell_y + cell_h / 2 + + matched_words = [ + w for w in strip_words + if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8 + ] + if matched_words: + matched_words.sort(key=lambda w: w['left']) + batch_text = ' '.join(w['text'] for w in matched_words) + batch_text = _clean_cell_text(batch_text) + if batch_text.strip(): + cells[ci]['text'] = batch_text + cells[ci]['confidence'] = round( + sum(w['conf'] for w in matched_words) / len(matched_words), 1 + ) + cells[ci]['ocr_engine'] = 'batch_column_ocr' + + batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip()) + if batch_filled > 0: + logger.info( + f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} " + f"empty cells in column {col_idx}" + ) + logger.info(f"build_cell_grid: {len(cells)} cells from " f"{len(content_rows)} rows × {len(relevant_cols)} columns, " f"engine={engine_name}") @@ -3869,6 +3957,69 @@ def _merge_phonetic_continuation_rows( return merged +def _merge_continuation_rows( + entries: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + """Merge multi-line vocabulary entries where text wraps to the next row. + + A row is a continuation of the previous entry when: + - EN has text, but DE is empty + - EN starts with a lowercase letter (not a new vocab entry) + - Previous entry's EN does NOT end with a sentence terminator (.!?) + - The continuation text has fewer than 4 words (not an example sentence) + - The row was not already merged as phonetic + + Example: + Row 5: EN="to put up" DE="aufstellen" + Row 6: EN="with sth." DE="" + → Merged: EN="to put up with sth." DE="aufstellen" + """ + if len(entries) < 2: + return entries + + merged: List[Dict[str, Any]] = [] + for entry in entries: + en = (entry.get('english') or '').strip() + de = (entry.get('german') or '').strip() + + if merged and en and not de: + # Check: not phonetic (already handled) + if _is_phonetic_only_text(en): + merged.append(entry) + continue + + # Check: starts with lowercase + first_alpha = next((c for c in en if c.isalpha()), '') + starts_lower = first_alpha and first_alpha.islower() + + # Check: fewer than 4 words (not an example sentence) + word_count = len(en.split()) + is_short = word_count < 4 + + # Check: previous entry doesn't end with sentence terminator + prev = merged[-1] + prev_en = (prev.get('english') or '').strip() + prev_ends_sentence = prev_en and prev_en[-1] in '.!?' + + if starts_lower and is_short and not prev_ends_sentence: + # Merge into previous entry + prev['english'] = (prev_en + ' ' + en).strip() + # Merge example if present + ex = (entry.get('example') or '').strip() + if ex: + prev_ex = (prev.get('example') or '').strip() + prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex + logger.debug( + f"Merged continuation row {entry.get('row_index')} " + f"into previous entry: {prev['english']!r}" + ) + continue + + merged.append(entry) + + return merged + + def build_word_grid( ocr_img: np.ndarray, column_regions: List[PageRegion], @@ -3920,9 +4071,12 @@ def build_word_grid( # --- Post-processing pipeline (deterministic, no LLM) --- n_raw = len(entries) - # 0. Merge phonetic-only continuation rows into previous entry + # 0a. Merge phonetic-only continuation rows into previous entry entries = _merge_phonetic_continuation_rows(entries) + # 0b. Merge multi-line continuation rows (lowercase EN, empty DE) + entries = _merge_continuation_rows(entries) + # 1. Fix character confusion (I/1/l based on context) entries = _fix_character_confusion(entries) @@ -4361,7 +4515,7 @@ async def run_cv_pipeline( # Stage 3: Dewarp if enable_dewarp: t = time.time() - img = dewarp_image(img) + img, _dewarp_info = dewarp_image(img) result.stages['dewarp'] = round(time.time() - t, 2) # Stage 4: Dual image preparation diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index afb0a81..cba9b80 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -1623,6 +1623,69 @@ async def save_reconstruction(session_id: str, request: Request): } +@router.post("/sessions/{session_id}/reprocess") +async def reprocess_session(session_id: str, request: Request): + """Re-run pipeline from a specific step, clearing downstream data. + + Body: {"from_step": 5} (1-indexed step number) + + Clears downstream results: + - from_step <= 1: deskew_result, dewarp_result, column_result, row_result, word_result + - from_step <= 2: dewarp_result, column_result, row_result, word_result + - from_step <= 3: column_result, row_result, word_result + - from_step <= 4: row_result, word_result + - from_step <= 5: word_result (cells, vocab_entries) + - from_step <= 6: word_result.llm_review only + """ + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + body = await request.json() + from_step = body.get("from_step", 1) + if not isinstance(from_step, int) or from_step < 1 or from_step > 7: + raise HTTPException(status_code=400, detail="from_step must be between 1 and 7") + + update_kwargs: Dict[str, Any] = {"current_step": from_step} + + # Clear downstream data based on from_step + if from_step <= 5: + update_kwargs["word_result"] = None + elif from_step == 6: + # Only clear LLM review from word_result + word_result = session.get("word_result") + if word_result: + word_result.pop("llm_review", None) + word_result.pop("llm_corrections", None) + update_kwargs["word_result"] = word_result + + if from_step <= 4: + update_kwargs["row_result"] = None + if from_step <= 3: + update_kwargs["column_result"] = None + if from_step <= 2: + update_kwargs["dewarp_result"] = None + if from_step <= 1: + update_kwargs["deskew_result"] = None + + await update_session_db(session_id, **update_kwargs) + + # Also clear cache + if session_id in _cache: + for key in list(update_kwargs.keys()): + if key != "current_step": + _cache[session_id][key] = update_kwargs[key] + _cache[session_id]["current_step"] = from_step + + logger.info(f"Session {session_id} reprocessing from step {from_step}") + + return { + "session_id": session_id, + "from_step": from_step, + "cleared": [k for k in update_kwargs if k != "current_step"], + } + + async def _get_rows_overlay(session_id: str) -> Response: """Generate dewarped image with row bands drawn on it.""" session = await get_session_db(session_id) diff --git a/klausur-service/backend/tests/test_cv_vocab_pipeline.py b/klausur-service/backend/tests/test_cv_vocab_pipeline.py index a1c77f1..4e17cd9 100644 --- a/klausur-service/backend/tests/test_cv_vocab_pipeline.py +++ b/klausur-service/backend/tests/test_cv_vocab_pipeline.py @@ -9,6 +9,9 @@ Tests cover: - Stage 5: Layout analysis (content bounds, projection profiles, column detection) - Stage 6: Multi-pass OCR region handling - Stage 7: Line grouping and vocabulary matching +- Noise filter functions (_is_noise_tail_token, _clean_cell_text) +- Phonetic detection (_is_phonetic_only_text) +- Phonetic & continuation row merging - Orchestrator (run_cv_pipeline) DSGVO Note: All tests run locally with synthetic data. No external API calls. @@ -36,6 +39,11 @@ from cv_vocab_pipeline import ( CV2_AVAILABLE, TESSERACT_AVAILABLE, CV_PIPELINE_AVAILABLE, + _is_noise_tail_token, + _clean_cell_text, + _is_phonetic_only_text, + _merge_phonetic_continuation_rows, + _merge_continuation_rows, ) @@ -202,16 +210,28 @@ class TestDeskew: @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") class TestDewarp: - """Test dewarp (pass-through) stage.""" + """Test dewarp stage (returns (image, info) tuple).""" - def test_dewarp_passthrough(self, white_image): - """Current dewarp should return the same image (pass-through).""" + def test_dewarp_returns_tuple(self, white_image): + """dewarp_image must return (image, dewarp_info) tuple.""" result = dewarp_image(white_image) - np.testing.assert_array_equal(result, white_image) + assert isinstance(result, tuple) + assert len(result) == 2 + img_out, info = result + assert isinstance(img_out, np.ndarray) + assert isinstance(info, dict) + assert "shear_degrees" in info def test_dewarp_preserves_shape(self, text_like_image): - result = dewarp_image(text_like_image) - assert result.shape == text_like_image.shape + """Output image should have same shape as input.""" + img_out, _ = dewarp_image(text_like_image) + assert img_out.shape == text_like_image.shape + + def test_dewarp_white_image_no_correction(self, white_image): + """A uniform white image should get no shear correction.""" + img_out, info = dewarp_image(white_image) + assert abs(info["shear_degrees"]) < 0.5 + assert img_out.shape == white_image.shape # ============================================= @@ -561,6 +581,268 @@ class TestStageIntegration: assert layout_img.shape[:2] == corrected.shape[:2] +# ============================================= +# NOISE FILTER TESTS +# ============================================= + +class TestNoiseFilter: + """Test _is_noise_tail_token for trailing OCR noise detection.""" + + # --- Tokens that should be KEPT (return False) --- + + @pytest.mark.parametrize("token", [ + # Compound words with hyphens + "money-saver", + "under-", + "well-known", + # Words with parenthesized parts (dictionary entries) + "Schild(chen)", + "(Salat-)Gurke", + "(auf)", + "(on)", + "selbst)", + "(wir", + "Tanz(veranstaltung)", + "(zer)brechen", + # Phonetic brackets + "serva]", + "['mani", + "[eg]", + "[maus]", + # Words with trailing punctuation + "cupcakes.", + "sister.", + "mice", + # Abbreviations + "e.g.", + "sth.", + "usw.", + "adj.", + # Ellipsis + "...", + "\u2026", + # Regular words + "the", + "cat", + "big", + "run", + "set", + "ago", + ]) + def test_keep_real_tokens(self, token): + """Real words, dictionary punctuation, and phonetic brackets are kept.""" + assert _is_noise_tail_token(token) is False, f"Should keep {token!r}" + + # --- Tokens that should be FILTERED (return True) --- + + @pytest.mark.parametrize("token", [ + # Pure non-alpha + "B|", + "3d", + "x7", + ")", + "|", + "@", + "3", + # Very short non-dictionary fragments + "ee", + "k", + "zz", + "qq", + # Empty + "", + " ", + ]) + def test_filter_noise_tokens(self, token): + """OCR noise fragments are filtered.""" + assert _is_noise_tail_token(token) is True, f"Should filter {token!r}" + + +class TestCleanCellText: + """Test _clean_cell_text integration (full text → cleaned text).""" + + def test_empty_returns_empty(self): + assert _clean_cell_text("") == "" + assert _clean_cell_text(" ") == "" + + def test_real_word_unchanged(self): + assert _clean_cell_text("cupcakes") == "cupcakes" + + def test_strips_trailing_noise(self): + """Trailing noise tokens should be removed.""" + result = _clean_cell_text("cupcakes B|") + assert result == "cupcakes" + + def test_keeps_trailing_real_word(self): + """Trailing real words should be kept.""" + result = _clean_cell_text("big cat") + assert result == "big cat" + + def test_abbreviation_kept(self): + """Known abbreviations should not be cleared.""" + result = _clean_cell_text("e.g.") + assert result == "e.g." + + def test_pure_garbage_cleared(self): + """OCR garbage without real words should be cleared.""" + result = _clean_cell_text("3d |x") + assert result == "" + + def test_compound_word_preserved(self): + """Compound words with hyphens should be preserved.""" + result = _clean_cell_text("money-saver") + assert result == "money-saver" + + def test_parenthesized_word_preserved(self): + result = _clean_cell_text("(Salat-)Gurke") + assert result == "(Salat-)Gurke" + + def test_multiple_trailing_noise(self): + """Multiple trailing noise tokens should all be removed.""" + result = _clean_cell_text("achieve 3 |") + assert result == "achieve" + + +class TestPhoneticOnlyText: + """Test _is_phonetic_only_text for phonetic transcription detection.""" + + @pytest.mark.parametrize("text,expected", [ + # Phonetic-only patterns → True + ("['mani serva]", True), + ("[dɑːns]", True), + ("[\"a:mand]", True), + ("['wɜːkʃɒp]", True), + # serva] has 5 alpha chars after bracket removal → NOT phonetic-only + ("serva]", False), + # NOT phonetic-only → False + ("almond ['a:mand]", False), + ("Mandel", False), + ("cupcakes", False), + ("", False), + ("achieve", False), + ("money-saver ['mani]", False), + ]) + def test_phonetic_detection(self, text, expected): + assert _is_phonetic_only_text(text) is expected, \ + f"_is_phonetic_only_text({text!r}) should be {expected}" + + +class TestMergePhoneticContinuationRows: + """Test _merge_phonetic_continuation_rows for phonetic row merging.""" + + def test_empty_list(self): + assert _merge_phonetic_continuation_rows([]) == [] + + def test_single_entry(self): + entries = [{"english": "cat", "german": "Katze", "example": ""}] + result = _merge_phonetic_continuation_rows(entries) + assert len(result) == 1 + assert result[0]["english"] == "cat" + + def test_merges_phonetic_row(self): + """Phonetic-only row should merge into previous entry.""" + entries = [ + {"english": "money-saver", "german": "Sparfuchs", "example": "", "row_index": 0}, + {"english": "['mani serva]", "german": "", "example": "", "row_index": 1}, + ] + result = _merge_phonetic_continuation_rows(entries) + assert len(result) == 1 + assert result[0]["english"] == "money-saver ['mani serva]" + assert result[0]["german"] == "Sparfuchs" + + def test_no_merge_when_de_present(self): + """Row with DE text should NOT be merged even if EN looks phonetic.""" + entries = [ + {"english": "cat", "german": "Katze", "example": ""}, + {"english": "[kæt]", "german": "some text", "example": ""}, + ] + result = _merge_phonetic_continuation_rows(entries) + assert len(result) == 2 + + def test_no_merge_regular_rows(self): + """Normal vocab rows should not be merged.""" + entries = [ + {"english": "cat", "german": "Katze", "example": ""}, + {"english": "dog", "german": "Hund", "example": ""}, + ] + result = _merge_phonetic_continuation_rows(entries) + assert len(result) == 2 + + def test_merges_example_too(self): + """If phonetic row has example text, it should merge into previous.""" + entries = [ + {"english": "dance", "german": "tanzen", "example": "", "row_index": 0}, + {"english": "[dɑːns]", "german": "", "example": "Let's dance.", "row_index": 1}, + ] + result = _merge_phonetic_continuation_rows(entries) + assert len(result) == 1 + assert result[0]["english"] == "dance [dɑːns]" + assert result[0]["example"] == "Let's dance." + + +class TestMergeContinuationRows: + """Test _merge_continuation_rows for multi-line entry merging.""" + + def test_empty_list(self): + assert _merge_continuation_rows([]) == [] + + def test_no_merge_independent_rows(self): + """Rows with both EN and DE should not be merged.""" + entries = [ + {"english": "cat", "german": "Katze", "example": "", "row_index": 0}, + {"english": "dog", "german": "Hund", "example": "", "row_index": 1}, + ] + result = _merge_continuation_rows(entries) + assert len(result) == 2 + + def test_merge_lowercase_continuation(self): + """Lowercase EN with empty DE should merge into previous.""" + entries = [ + {"english": "to put up", "german": "aufstellen", "example": "", "row_index": 0}, + {"english": "with sth.", "german": "", "example": "", "row_index": 1}, + ] + result = _merge_continuation_rows(entries) + assert len(result) == 1 + assert result[0]["english"] == "to put up with sth." + assert result[0]["german"] == "aufstellen" + + def test_no_merge_uppercase_start(self): + """EN starting with uppercase and empty DE is likely its own entry, not a continuation.""" + entries = [ + {"english": "cat", "german": "Katze", "example": "", "row_index": 0}, + {"english": "Dog", "german": "", "example": "", "row_index": 1}, + ] + result = _merge_continuation_rows(entries) + assert len(result) == 2 + + def test_no_merge_when_previous_ends_with_period(self): + """If previous entry ends with sentence terminator, next is not continuation.""" + entries = [ + {"english": "That's great.", "german": "Das ist toll.", "example": "", "row_index": 0}, + {"english": "really nice", "german": "", "example": "", "row_index": 1}, + ] + result = _merge_continuation_rows(entries) + assert len(result) == 2 + + def test_no_merge_long_text(self): + """Text with 4+ words is likely an example sentence, not continuation.""" + entries = [ + {"english": "achieve", "german": "erreichen", "example": "", "row_index": 0}, + {"english": "she achieved her goals", "german": "", "example": "", "row_index": 1}, + ] + result = _merge_continuation_rows(entries) + assert len(result) == 2 + + def test_first_entry_not_merged(self): + """First entry with empty DE should not crash (no previous).""" + entries = [ + {"english": "something", "german": "", "example": "", "row_index": 0}, + {"english": "cat", "german": "Katze", "example": "", "row_index": 1}, + ] + result = _merge_continuation_rows(entries) + assert len(result) == 2 + + # ============================================= # RUN TESTS # =============================================