From 8507e2e035dbec247ccc3d57ff346c6a7e47ba4c Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 1 Mar 2026 11:32:10 +0100 Subject: [PATCH] fix(ocr-pipeline): split oversized cells before OCR to capture all text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For cells taller than 1.5× median row height, split vertically into sub-cells and OCR each separately. This fixes RapidOCR losing text at the bottom of tall cells (e.g. "floor/Fußboden" below "egg/Ei" in a merged row). Generic fix — works for any oversized cell. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 48 +++++++++++++++----- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index c2da3d1..ea283b7 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2885,6 +2885,10 @@ def build_word_grid( entries: List[Dict[str, Any]] = [] + # Calculate median row height for oversized detection + row_heights = sorted(r.height for r in content_rows) + median_row_h = row_heights[len(row_heights) // 2] if row_heights else 100 + for row_idx, row in enumerate(content_rows): entry: Dict[str, Any] = { 'row_index': row_idx, @@ -2926,18 +2930,40 @@ def build_word_grid( if cell_w <= 0 or cell_h <= 0: continue - cell_region = PageRegion( - type=col.type, - x=cell_x, y=cell_y, - width=cell_w, height=cell_h, - ) - - # OCR the cell - if use_rapid: - words = ocr_region_rapid(img_bgr, cell_region) + # For oversized cells (>1.5× median), split vertically into sub-cells + # and OCR each separately. This prevents OCR from missing text at + # the bottom of tall cells (RapidOCR downscales tall narrow crops). + is_oversized = row.height > median_row_h * 1.5 and median_row_h > 20 + if is_oversized: + n_splits = max(2, round(row.height / median_row_h)) + sub_h = cell_h / n_splits + words = [] + for s in range(n_splits): + sub_y = int(cell_y + s * sub_h) + sub_height = int(sub_h) if s < n_splits - 1 else (cell_y + cell_h - sub_y) + sub_region = PageRegion( + type=col.type, + x=cell_x, y=sub_y, + width=cell_w, height=max(1, sub_height), + ) + if use_rapid: + sub_words = ocr_region_rapid(img_bgr, sub_region) + else: + cell_lang = lang_map.get(col.type, lang) + sub_words = ocr_region(ocr_img, sub_region, lang=cell_lang, psm=6) + words.extend(sub_words) else: - cell_lang = lang_map.get(col.type, lang) - words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6) + cell_region = PageRegion( + type=col.type, + x=cell_x, y=cell_y, + width=cell_w, height=cell_h, + ) + # OCR the cell + if use_rapid: + words = ocr_region_rapid(img_bgr, cell_region) + else: + cell_lang = lang_map.get(col.type, lang) + words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6) # Group into lines, then join in reading order (Fix A) # Use half of average word height as Y-tolerance