fix(ocr-pipeline): split oversized cells before OCR to capture all text

For cells in rows taller than 1.5× the median row height, split the cell
into vertically stacked sub-cells and OCR each one separately. This fixes
RapidOCR losing text at the bottom of tall cells (e.g. "floor/Fußboden"
below "egg/Ei" in a merged row). The fix is generic: it applies to any
oversized cell, on both the RapidOCR and the non-RapidOCR code paths.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-01 11:32:10 +01:00
parent 854d8b431b
commit 8507e2e035

View File

@@ -2885,6 +2885,10 @@ def build_word_grid(
entries: List[Dict[str, Any]] = []
# Calculate median row height for oversized detection
row_heights = sorted(r.height for r in content_rows)
median_row_h = row_heights[len(row_heights) // 2] if row_heights else 100
for row_idx, row in enumerate(content_rows):
entry: Dict[str, Any] = {
'row_index': row_idx,
@@ -2926,18 +2930,40 @@ def build_word_grid(
if cell_w <= 0 or cell_h <= 0:
continue
cell_region = PageRegion(
type=col.type,
x=cell_x, y=cell_y,
width=cell_w, height=cell_h,
)
# OCR the cell
if use_rapid:
words = ocr_region_rapid(img_bgr, cell_region)
# For oversized cells (>1.5× median), split vertically into sub-cells
# and OCR each separately. This prevents OCR from missing text at
# the bottom of tall cells (RapidOCR downscales tall narrow crops).
is_oversized = row.height > median_row_h * 1.5 and median_row_h > 20
if is_oversized:
n_splits = max(2, round(row.height / median_row_h))
sub_h = cell_h / n_splits
words = []
for s in range(n_splits):
sub_y = int(cell_y + s * sub_h)
sub_height = int(sub_h) if s < n_splits - 1 else (cell_y + cell_h - sub_y)
sub_region = PageRegion(
type=col.type,
x=cell_x, y=sub_y,
width=cell_w, height=max(1, sub_height),
)
if use_rapid:
sub_words = ocr_region_rapid(img_bgr, sub_region)
else:
cell_lang = lang_map.get(col.type, lang)
sub_words = ocr_region(ocr_img, sub_region, lang=cell_lang, psm=6)
words.extend(sub_words)
else:
cell_lang = lang_map.get(col.type, lang)
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
cell_region = PageRegion(
type=col.type,
x=cell_x, y=cell_y,
width=cell_w, height=cell_h,
)
# OCR the cell
if use_rapid:
words = ocr_region_rapid(img_bgr, cell_region)
else:
cell_lang = lang_map.get(col.type, lang)
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
# Group into lines, then join in reading order (Fix A)
# Use half of average word height as Y-tolerance