From 6bca3370e0deabe916f1fca2a191850d0d07b215 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Mon, 2 Mar 2026 09:16:50 +0100
Subject: [PATCH] fix(ocr-pipeline): fix vocab post-processing destroying
 correct cell results
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three bugs in the post-processing pipeline were overwriting correct
streaming results with wrong ones:

1. _split_comma_entries was splitting "Maus, Mäuse" into two separate
   entries. The call is now disabled in both detect_words and
   _word_stream_generator — word forms belong together in one entry.

2. _attach_example_sentences treated "Ei" (2 chars) as OCR noise due
   to the `len(de) > 2` threshold. Lowered it to `len(de) > 1`, so only
   a single-character DE value is still treated as noise.

3. _attach_example_sentences wrongly classified rows with EN text but
   no DE (like "stand ...") as example sentences, merging them into
   the previous entry. A row is now treated as an example sentence
   only if it has EN text, no DE, and an empty example column; rows
   with text in the example column are kept as separate entries.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 16 +++++++++++-----
 klausur-service/backend/ocr_pipeline_api.py  |  8 ++++++--
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 50e0a4a..eea7a64 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -2868,15 +2868,21 @@ def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, A
         de = (entry.get('german', '') or '').strip()
         ex = (entry.get('example', '') or '').strip()
 
-        # Treat very short DE (≤2 chars) as OCR noise, not real translation
-        has_de = len(de) > 2
+        # Treat single-char DE as OCR noise, not real translation.
+        # "Ei" (2 chars) is a valid German word, so threshold is 1.
+        has_de = len(de) > 1
         has_en = bool(en)
+        has_ex = bool(ex)
 
-        if has_en and not has_de and vocab_entries:
+        # A row is an example candidate ONLY if it has EN text but
+        # NO DE translation AND NO example-column text.  Rows with
+        # text in the example column are real vocab entries (e.g.
+        # continuation lines like "stand ..." / "German: Ich möchte...").
+        is_example_candidate = has_en and not has_de and not has_ex and vocab_entries
+
+        if is_example_candidate:
             # This is an example sentence — find best matching vocab entry
             example_text = en
-            if ex:
-                example_text = f"{en} — {ex}"
 
             match_idx = _find_best_vocab_match(en, vocab_entries)
             if match_idx < 0:
diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index 24f1d4a..e900e9c 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -1179,7 +1179,9 @@ async def detect_words(
         entries = _cells_to_vocab_entries(cells, columns_meta)
         entries = _fix_character_confusion(entries)
         entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
-        entries = _split_comma_entries(entries)
+        # NOTE: _split_comma_entries disabled — word forms like "mouse, mice"
+        # / "Maus, Mäuse" belong together in one entry.
+        # entries = _split_comma_entries(entries)
         entries = _attach_example_sentences(entries)
         word_result["vocab_entries"] = entries
         # Also keep "entries" key for backwards compatibility
@@ -1308,7 +1310,9 @@ async def _word_stream_generator(
         entries = _cells_to_vocab_entries(all_cells, columns_meta)
         entries = _fix_character_confusion(entries)
         entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
-        entries = _split_comma_entries(entries)
+        # NOTE: _split_comma_entries disabled — word forms like "mouse, mice"
+        # / "Maus, Mäuse" belong together in one entry.
+        # entries = _split_comma_entries(entries)
         entries = _attach_example_sentences(entries)
         word_result["vocab_entries"] = entries
         word_result["entries"] = entries