From 72cc77dcf4c0c8c611f1a78578600120ec7d98b9 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 2 Mar 2026 09:41:30 +0100 Subject: [PATCH] fix(ocr-pipeline): cells = result, no post-processing content shuffling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cell grid IS the result. Each cell stays at its detected position. Removed _split_comma_entries and _attach_example_sentences from the pipeline — they were shuffling content between rows/columns, causing "Mäuse" to appear in a separate row, "stand..." to move to Example, and "Ei" to disappear. Now: cells → _cells_to_vocab_entries (1:1 row mapping) → _fix_character_confusion → _fix_phonetic_brackets → done. Also lowered pixel-density threshold from 2% to 0.5% for the cell-OCR fallback so small text like "Ei" is not filtered out. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 6 ++++-- klausur-service/backend/ocr_pipeline_api.py | 13 ++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 87353be..cedd4fa 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3186,9 +3186,11 @@ def _ocr_single_cell( if ocr_img is not None: crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] if crop.size > 0: - # Threshold: pixels darker than 180 (on 0-255 grayscale) + # Threshold: pixels darker than 180 (on 0-255 grayscale). + # Use 0.5% to catch even small text like "Ei" (2 chars) + # in an otherwise empty cell. dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size - _run_fallback = dark_ratio > 0.02 + _run_fallback = dark_ratio > 0.005 if _run_fallback: cell_region = PageRegion( type=col.type, diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 24f1d4a..b0a95fc 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -35,8 +35,6 @@ from cv_vocab_pipeline import ( _cells_to_vocab_entries, _fix_character_confusion, _fix_phonetic_brackets, - _split_comma_entries, - _attach_example_sentences, analyze_layout, analyze_layout_by_words, build_cell_grid, @@ -1174,15 +1172,13 @@ async def detect_words( }, } - # For vocab layout: add post-processed vocab_entries (backwards compat) + # For vocab layout: map cells 1:1 to vocab entries (row→entry). + # No content shuffling — each cell stays at its detected position. if is_vocab: entries = _cells_to_vocab_entries(cells, columns_meta) entries = _fix_character_confusion(entries) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) - entries = _split_comma_entries(entries) - entries = _attach_example_sentences(entries) word_result["vocab_entries"] = entries - # Also keep "entries" key for backwards compatibility word_result["entries"] = entries word_result["entry_count"] = len(entries) word_result["summary"]["total_entries"] = len(entries) @@ -1302,14 +1298,13 @@ async def _word_stream_generator( }, } - # Vocab post-processing + # For vocab layout: map cells 1:1 to vocab entries (row→entry). + # No content shuffling — each cell stays at its detected position. vocab_entries = None if is_vocab: entries = _cells_to_vocab_entries(all_cells, columns_meta) entries = _fix_character_confusion(entries) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) - entries = _split_comma_entries(entries) - entries = _attach_example_sentences(entries) word_result["vocab_entries"] = entries word_result["entries"] = entries word_result["entry_count"] = len(entries)