diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index c2da3d1..ea283b7 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2885,6 +2885,10 @@ def build_word_grid( entries: List[Dict[str, Any]] = [] + # Calculate median row height for oversized detection + row_heights = sorted(r.height for r in content_rows) + median_row_h = row_heights[len(row_heights) // 2] if row_heights else 100 + for row_idx, row in enumerate(content_rows): entry: Dict[str, Any] = { 'row_index': row_idx, @@ -2926,18 +2930,40 @@ def build_word_grid( if cell_w <= 0 or cell_h <= 0: continue - cell_region = PageRegion( - type=col.type, - x=cell_x, y=cell_y, - width=cell_w, height=cell_h, - ) - - # OCR the cell - if use_rapid: - words = ocr_region_rapid(img_bgr, cell_region) + # For oversized cells (>1.5× median), split vertically into sub-cells + # and OCR each separately. This prevents OCR from missing text at + # the bottom of tall cells (RapidOCR downscales tall narrow crops). + is_oversized = row.height > median_row_h * 1.5 and median_row_h > 20 + if is_oversized: + n_splits = max(2, round(row.height / median_row_h)) + sub_h = cell_h / n_splits + words = [] + for s in range(n_splits): + sub_y = int(cell_y + s * sub_h) + sub_height = int(sub_h) if s < n_splits - 1 else (cell_y + cell_h - sub_y) + sub_region = PageRegion( + type=col.type, + x=cell_x, y=sub_y, + width=cell_w, height=max(1, sub_height), + ) + if use_rapid: + sub_words = ocr_region_rapid(img_bgr, sub_region) + else: + cell_lang = lang_map.get(col.type, lang) + sub_words = ocr_region(ocr_img, sub_region, lang=cell_lang, psm=6) + words.extend(sub_words) else: - cell_lang = lang_map.get(col.type, lang) - words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6) + cell_region = PageRegion( + type=col.type, + x=cell_x, y=cell_y, + width=cell_w, height=cell_h, + ) + # OCR the cell + if use_rapid: + words = ocr_region_rapid(img_bgr, cell_region) + else: + cell_lang = lang_map.get(col.type, lang) + words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6) # Group into lines, then join in reading order (Fix A) # Use half of average word height as Y-tolerance