From e3f939a6282f5dd5e53cd66ffbd02dd295e83793 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 2 Mar 2026 09:27:30 +0100 Subject: [PATCH] refactor(ocr-pipeline): make post-processing fully generic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three non-generic solutions replaced with universal heuristics: 1. Cell-OCR fallback: instead of restricting to column_en/column_de, now checks pixel density (>2% dark pixels) for ANY column type. Truly empty cells are skipped without running Tesseract. 2. Example-sentence detection: instead of checking for example-column text (worksheet-specific), now uses sentence heuristics (>=4 words or ends with sentence punctuation). Short EN text without DE is kept as a vocab entry (OCR may have missed the translation). 3. Comma-split: re-enabled with singular/plural detection. Pairs like "mouse, mice" / "Maus, Mäuse" are kept together. Verb forms like "break, broke, broken" are still split into individual entries. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 91 ++++++++++++++++---- klausur-service/backend/ocr_pipeline_api.py | 8 +- 2 files changed, 76 insertions(+), 23 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index eea7a64..87353be 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2713,14 +2713,54 @@ def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, An # --- B. Comma-Separated Word Form Splitting --- +def _is_singular_plural_pair(parts: List[str]) -> bool: + """Detect if comma-separated parts are singular/plural forms of the same word. + + E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split). + "break, broke, broken" → False (different verb forms, OK to split). 
+ + Heuristic: exactly 2 parts that share a common prefix of >= 50% length, + OR one part is a known plural suffix of the other (e.g. +s, +es, +en). + """ + if len(parts) != 2: + return False + + a, b = parts[0].lower().strip(), parts[1].lower().strip() + if not a or not b: + return False + + # Common prefix heuristic: if words share >= 50% of the shorter word, + # they are likely forms of the same word (Maus/Mäuse, child/children). + min_len = min(len(a), len(b)) + common = 0 + for ca, cb in zip(a, b): + if ca == cb: + common += 1 + else: + break + if common >= max(2, min_len * 0.5): + return True + + # Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü) + umlaut_map = str.maketrans('aou', 'äöü') + if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a: + return True + + return False + + def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Split entries with comma-separated word forms into individual entries. E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen" → 3 entries: break/brechen, broke/brach, broken/gebrochen + Does NOT split singular/plural pairs like "mouse, mice" / "Maus, Mäuse" + because those are forms of the same vocabulary entry. + Only splits when both EN and DE have the same number of comma-parts, - or when one side has multiple and the other has exactly one. + parts are short (word forms, not sentences), and the parts are not a + singular/plural pair of the same word (those belong together as one entry). 
""" result: List[Dict[str, Any]] = [] @@ -2732,13 +2772,17 @@ def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]: en_parts = _split_by_comma(en) de_parts = _split_by_comma(de) - # Only split if we have multiple parts and counts match or one side is single + # Only split if we have multiple parts and counts match should_split = False if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts): - # Both have same count — each part is a word form - # But only if parts are short (word forms, not sentences) + # All parts must be short (word forms, not sentences) if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts): - should_split = True + # Do NOT split singular/plural pairs (2 parts that are + # forms of the same word) + if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts): + should_split = False + else: + should_split = True if not should_split: result.append(entry) @@ -2872,13 +2916,18 @@ def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, A # "Ei" (2 chars) is a valid German word, so threshold is 1. has_de = len(de) > 1 has_en = bool(en) - has_ex = bool(ex) - # A row is an example candidate ONLY if it has EN text but - # NO DE translation AND NO example-column text. Rows with - # text in the example column are real vocab entries (e.g. - # continuation lines like "stand ..." / "German: Ich möchte..."). - is_example_candidate = has_en and not has_de and not has_ex and vocab_entries + # Heuristic: a row without DE is an "example sentence" only if + # the EN text looks like a sentence (>= 4 words, or contains + # typical sentence punctuation). Short EN text (1-3 words) is + # more likely a vocab entry whose DE was missed by OCR. 
+ _looks_like_sentence = ( + len(en.split()) >= 4 + or en.rstrip().endswith(('.', '!', '?')) + ) + is_example_candidate = ( + has_en and not has_de and _looks_like_sentence and vocab_entries + ) if is_example_candidate: # This is an example sentence — find best matching vocab entry @@ -3127,12 +3176,20 @@ def _ocr_single_cell( # --- FALLBACK: Cell-OCR for empty cells --- # Full-page Tesseract can miss small or isolated words (e.g. "Ei"). # Re-run OCR on the cell crop to catch what word-lookup missed. - # Only run fallback for EN/DE columns (where vocab words are expected). - # Example columns are often legitimately empty and running Tesseract on - # all of them wastes ~10s. column_example cells stay empty if word-lookup - # found nothing. - _fallback_col_types = {'column_en', 'column_de'} - if not text.strip() and cell_w > 0 and cell_h > 0 and col.type in _fallback_col_types: + # To avoid wasting time on truly empty cells, check pixel density first: + # only run Tesseract if the cell crop contains enough dark pixels to + # plausibly contain text. + _run_fallback = False + if not text.strip() and cell_w > 0 and cell_h > 0: + # Quick pixel-density check: binarise the cell crop and count + # dark pixels. Text cells typically have >2% ink coverage. 
+ if ocr_img is not None: + crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] + if crop.size > 0: + # Threshold: pixels darker than 180 (on 0-255 grayscale) + dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size + _run_fallback = dark_ratio > 0.02 + if _run_fallback: cell_region = PageRegion( type=col.type, x=cell_x, y=cell_y, diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index e900e9c..24f1d4a 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -1179,9 +1179,7 @@ async def detect_words( entries = _cells_to_vocab_entries(cells, columns_meta) entries = _fix_character_confusion(entries) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) - # NOTE: _split_comma_entries disabled — word forms like "mouse, mice" - # / "Maus, Mäuse" belong together in one entry. - # entries = _split_comma_entries(entries) + entries = _split_comma_entries(entries) entries = _attach_example_sentences(entries) word_result["vocab_entries"] = entries # Also keep "entries" key for backwards compatibility @@ -1310,9 +1308,7 @@ async def _word_stream_generator( entries = _cells_to_vocab_entries(all_cells, columns_meta) entries = _fix_character_confusion(entries) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) - # NOTE: _split_comma_entries disabled — word forms like "mouse, mice" - # / "Maus, Mäuse" belong together in one entry. - # entries = _split_comma_entries(entries) + entries = _split_comma_entries(entries) entries = _attach_example_sentences(entries) word_result["vocab_entries"] = entries word_result["entries"] = entries