From 8507e2e035dbec247ccc3d57ff346c6a7e47ba4c Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Sun, 1 Mar 2026 11:32:10 +0100
Subject: [PATCH] fix(ocr-pipeline): split oversized cells before OCR to
 capture all text
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

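For cells in rows taller than 1.5× the median row height, split the
cell into round(row height / median row height) vertically stacked
sub-cells (at least 2) and OCR each sub-cell separately, then merge the
words. This fixes RapidOCR losing text at the bottom of tall cells
(e.g. "floor/Fußboden" below "egg/Ei" in a merged row). Generic fix —
works for any oversized cell and for both OCR backends; the split is
skipped when the median row height is 20 or less.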

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 48 +++++++++++++++-----
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index c2da3d1..ea283b7 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -2885,6 +2885,10 @@ def build_word_grid(
 
     entries: List[Dict[str, Any]] = []
 
+    # Calculate median row height for oversized detection
+    row_heights = sorted(r.height for r in content_rows)
+    median_row_h = row_heights[len(row_heights) // 2] if row_heights else 100
+
     for row_idx, row in enumerate(content_rows):
         entry: Dict[str, Any] = {
             'row_index': row_idx,
@@ -2926,18 +2930,40 @@ def build_word_grid(
             if cell_w <= 0 or cell_h <= 0:
                 continue
 
-            cell_region = PageRegion(
-                type=col.type,
-                x=cell_x, y=cell_y,
-                width=cell_w, height=cell_h,
-            )
-
-            # OCR the cell
-            if use_rapid:
-                words = ocr_region_rapid(img_bgr, cell_region)
+            # For oversized cells (>1.5× median), split vertically into sub-cells
+            # and OCR each separately. This prevents OCR from missing text at
+            # the bottom of tall cells (RapidOCR downscales tall narrow crops).
+            is_oversized = row.height > median_row_h * 1.5 and median_row_h > 20
+            if is_oversized:
+                n_splits = max(2, round(row.height / median_row_h))
+                sub_h = cell_h / n_splits
+                words = []
+                for s in range(n_splits):
+                    sub_y = int(cell_y + s * sub_h)
+                    sub_height = int(sub_h) if s < n_splits - 1 else (cell_y + cell_h - sub_y)
+                    sub_region = PageRegion(
+                        type=col.type,
+                        x=cell_x, y=sub_y,
+                        width=cell_w, height=max(1, sub_height),
+                    )
+                    if use_rapid:
+                        sub_words = ocr_region_rapid(img_bgr, sub_region)
+                    else:
+                        cell_lang = lang_map.get(col.type, lang)
+                        sub_words = ocr_region(ocr_img, sub_region, lang=cell_lang, psm=6)
+                    words.extend(sub_words)
             else:
-                cell_lang = lang_map.get(col.type, lang)
-                words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
+                cell_region = PageRegion(
+                    type=col.type,
+                    x=cell_x, y=cell_y,
+                    width=cell_w, height=cell_h,
+                )
+                # OCR the cell
+                if use_rapid:
+                    words = ocr_region_rapid(img_bgr, cell_region)
+                else:
+                    cell_lang = lang_map.get(col.type, lang)
+                    words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
 
             # Group into lines, then join in reading order (Fix A)
             # Use half of average word height as Y-tolerance