From 77869e32f4076866211d681333e160b7b633d180 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Mon, 2 Mar 2026 07:24:46 +0100
Subject: [PATCH] feat(ocr-pipeline): use word-lookup instead of cell-OCR for
 cell grid

Replace per-cell Tesseract re-runs with a lookup of the pre-existing
full-page words stored in row.words. A word is assigned to a cell when
its horizontal center falls within the padded column bounds. This fixes
phantom rows with garbage text, missing last words, and incomplete
example text by reusing the more reliable full-page OCR results.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 75 ++++++++++++++------
 1 file changed, 53 insertions(+), 22 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index e3b88a7..23dceb7 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3009,6 +3009,48 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
     return _PHONETIC_BRACKET_RE.sub(replacer, text)
 
 
+def _lookup_cell_words(
+    row: RowGeometry,
+    col: PageRegion,
+    pad: int = 8,
+) -> Tuple[List[Dict], float]:
+    """Look up pre-existing Tesseract words that fall within a cell region.
+
+    Instead of re-running OCR on a cell crop, this selects the full-page
+    Tesseract words (row.words) whose X-center lies in the padded column.
+
+    Words use coordinates relative to the content ROI; columns use absolute
+    coordinates.  row.x equals the content-ROI left_x, so we convert with:
+        col_left_rel = col.x - row.x - pad  (right: ... + col.width + pad)
+
+    Returns:
+        (words_in_cell, avg_confidence) where words_in_cell keep their
+        original relative coordinates (compatible with
+        _words_to_reading_order_text).
+    """
+    if not row.words:
+        return [], 0.0
+
+    left_x = row.x  # content ROI offset (absolute)
+    col_left_rel = col.x - left_x - pad
+    col_right_rel = col.x - left_x + col.width + pad
+
+    words_in_cell = []
+    for w in row.words:
+        w_left = w['left']
+        w_right = w_left + w['width']
+        # Word center must lie within the padded column bounds (inclusive)
+        w_center_x = (w_left + w_right) / 2
+        if col_left_rel <= w_center_x <= col_right_rel:
+            words_in_cell.append(w)
+
+    avg_conf = 0.0
+    if words_in_cell:
+        avg_conf = round(sum(w['conf'] for w in words_in_cell) / len(words_in_cell), 1)
+
+    return words_in_cell, avg_conf
+
+
 def _ocr_single_cell(
     row_idx: int,
     col_idx: int,
@@ -3023,7 +3065,7 @@ def _ocr_single_cell(
     lang: str,
     lang_map: Dict[str, str],
 ) -> Dict[str, Any]:
-    """OCR a single cell (column × row intersection) and return its dict."""
+    """Populate a single cell (column x row intersection) via word lookup."""
     pad = 8  # pixels
     cell_x = max(0, col.x - pad)
     cell_y = max(0, row.y - pad)
@@ -3051,33 +3093,22 @@ def _ocr_single_cell(
                 'w': round(col.width / img_w * 100, 2),
                 'h': round(row.height / img_h * 100, 2),
             },
-            'ocr_engine': engine_name,
+            'ocr_engine': 'word_lookup',
         }
 
-    cell_region = PageRegion(
-        type=col.type,
-        x=cell_x, y=cell_y,
-        width=cell_w, height=cell_h,
-    )
+    # --- PRIMARY: Word-lookup from full-page Tesseract ---
+    # Use pre-existing words from row.words (Step 4) instead of
+    # re-running OCR on a small crop.  This is more reliable because
+    # full-page Tesseract has better context for recognition.
+    words, avg_conf = _lookup_cell_words(row, col, pad=pad)
 
-    # OCR the cell
-    if use_rapid:
-        words = ocr_region_rapid(img_bgr, cell_region)
-    else:
-        cell_lang = lang_map.get(col.type, lang)
-        words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
-
-    # Group into lines, then join in reading order
     if words:
         avg_h = sum(w['height'] for w in words) / len(words)
         y_tol = max(10, int(avg_h * 0.5))
+        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
     else:
-        y_tol = 15
-    text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
-
-    avg_conf = 0.0
-    if words:
-        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
+        text = ''
+        avg_conf = 0.0
 
     return {
         'cell_id': f"R{row_idx:02d}_C{col_idx}",
@@ -3093,7 +3124,7 @@ def _ocr_single_cell(
             'w': round(cell_w / img_w * 100, 2),
             'h': round(cell_h / img_h * 100, 2),
         },
-        'ocr_engine': engine_name,
+        'ocr_engine': 'word_lookup',
     }