fix(ocr-pipeline): split oversized cells before OCR to capture all text

For cells in rows taller than 1.5× the median row height, split the cell vertically into sub-cells and OCR each one separately. This fixes RapidOCR losing text at the bottom of tall cells (e.g. "floor/Fußboden" below "egg/Ei" in a merged row). The fix is generic: it applies to any oversized cell, on both the RapidOCR and the fallback OCR path.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 11:32:10 +01:00
parent 854d8b431b
commit 8507e2e035
1 changed file with 37 additions and 11 deletions
@@ -2885,6 +2885,10 @@ def build_word_grid(

    entries: List[Dict[str, Any]] = []

+    # Calculate median row height for oversized detection
+    row_heights = sorted(r.height for r in content_rows)
+    median_row_h = row_heights[len(row_heights) // 2] if row_heights else 100
+
    for row_idx, row in enumerate(content_rows):
        entry: Dict[str, Any] = {
            'row_index': row_idx,
@@ -2926,18 +2930,40 @@ def build_word_grid(
            if cell_w <= 0 or cell_h <= 0:
                continue

-            cell_region = PageRegion(
-                type=col.type,
-                x=cell_x, y=cell_y,
-                width=cell_w, height=cell_h,
-            )
-
-            # OCR the cell
-            if use_rapid:
-                words = ocr_region_rapid(img_bgr, cell_region)
+            # For oversized cells (>1.5× median), split vertically into sub-cells
+            # and OCR each separately. This prevents OCR from missing text at
+            # the bottom of tall cells (RapidOCR downscales tall narrow crops).
+            is_oversized = row.height > median_row_h * 1.5 and median_row_h > 20
+            if is_oversized:
+                n_splits = max(2, round(row.height / median_row_h))
+                sub_h = cell_h / n_splits
+                words = []
+                for s in range(n_splits):
+                    sub_y = int(cell_y + s * sub_h)
+                    sub_height = int(sub_h) if s < n_splits - 1 else (cell_y + cell_h - sub_y)
+                    sub_region = PageRegion(
+                        type=col.type,
+                        x=cell_x, y=sub_y,
+                        width=cell_w, height=max(1, sub_height),
+                    )
+                    if use_rapid:
+                        sub_words = ocr_region_rapid(img_bgr, sub_region)
+                    else:
+                        cell_lang = lang_map.get(col.type, lang)
+                        sub_words = ocr_region(ocr_img, sub_region, lang=cell_lang, psm=6)
+                    words.extend(sub_words)
            else:
-                cell_lang = lang_map.get(col.type, lang)
-                words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
+                cell_region = PageRegion(
+                    type=col.type,
+                    x=cell_x, y=cell_y,
+                    width=cell_w, height=cell_h,
+                )
+                # OCR the cell
+                if use_rapid:
+                    words = ocr_region_rapid(img_bgr, cell_region)
+                else:
+                    cell_lang = lang_map.get(col.type, lang)
+                    words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)

            # Group into lines, then join in reading order (Fix A)
            # Use half of average word height as Y-tolerance