From 72cc77dcf4c0c8c611f1a78578600120ec7d98b9 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Mon, 2 Mar 2026 09:41:30 +0100
Subject: [PATCH] fix(ocr-pipeline): cells = result, no post-processing content
 shuffling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cell grid IS the result. Each cell stays at its detected position.

Removed _split_comma_entries and _attach_example_sentences from the
pipeline — they were shuffling content between rows/columns, causing
"Mäuse" to appear in a separate row, "stand..." to move to Example,
and "Ei" to disappear.

Now: cells → _cells_to_vocab_entries (1:1 row mapping) →
_fix_character_confusion → _fix_phonetic_brackets → done.

Also lowered the dark-pixel ratio threshold from 2% to 0.5% for the
cell-OCR fallback so that small text like "Ei" is not filtered out.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py |  6 ++++--
 klausur-service/backend/ocr_pipeline_api.py  | 13 ++++---------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 87353be..cedd4fa 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3186,9 +3186,11 @@ def _ocr_single_cell(
         if ocr_img is not None:
             crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
             if crop.size > 0:
-                # Threshold: pixels darker than 180 (on 0-255 grayscale)
+                # A pixel counts as dark below 180 (0-255 grayscale).
+                # Run the fallback when more than 0.5% of the crop is dark,
+                # so even small text like "Ei" (2 chars) is caught.
                 dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
-                _run_fallback = dark_ratio > 0.02
+                _run_fallback = dark_ratio > 0.005
     if _run_fallback:
         cell_region = PageRegion(
             type=col.type,
diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index 24f1d4a..b0a95fc 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -35,8 +35,6 @@ from cv_vocab_pipeline import (
     _cells_to_vocab_entries,
     _fix_character_confusion,
     _fix_phonetic_brackets,
-    _split_comma_entries,
-    _attach_example_sentences,
     analyze_layout,
     analyze_layout_by_words,
     build_cell_grid,
@@ -1174,15 +1172,13 @@ async def detect_words(
         },
     }
 
-    # For vocab layout: add post-processed vocab_entries (backwards compat)
+    # For vocab layout: map cells 1:1 to vocab entries (row→entry).
+    # No content shuffling — each cell stays at its detected position.
     if is_vocab:
         entries = _cells_to_vocab_entries(cells, columns_meta)
         entries = _fix_character_confusion(entries)
         entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
-        entries = _split_comma_entries(entries)
-        entries = _attach_example_sentences(entries)
         word_result["vocab_entries"] = entries
-        # Also keep "entries" key for backwards compatibility
         word_result["entries"] = entries
         word_result["entry_count"] = len(entries)
         word_result["summary"]["total_entries"] = len(entries)
@@ -1302,14 +1298,13 @@ async def _word_stream_generator(
         },
     }
 
-    # Vocab post-processing
+    # For vocab layout: map cells 1:1 to vocab entries (row→entry).
+    # No content shuffling — each cell stays at its detected position.
     vocab_entries = None
     if is_vocab:
         entries = _cells_to_vocab_entries(all_cells, columns_meta)
         entries = _fix_character_confusion(entries)
         entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
-        entries = _split_comma_entries(entries)
-        entries = _attach_example_sentences(entries)
         word_result["vocab_entries"] = entries
         word_result["entries"] = entries
         word_result["entry_count"] = len(entries)