fix(ocr-pipeline): split oversized cells before OCR to capture all text
For cells in rows taller than 1.5× the median row height, split the cell into vertically stacked sub-cells (roughly one per median row height, at least two) and OCR each sub-cell separately. This fixes RapidOCR losing text at the bottom of tall cells (e.g. "floor/Fußboden" below "egg/Ei" in a merged row). The fix is generic: it applies to any oversized cell. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2885,6 +2885,10 @@ def build_word_grid(
|
||||
|
||||
entries: List[Dict[str, Any]] = []
|
||||
|
||||
# Calculate median row height for oversized detection
|
||||
row_heights = sorted(r.height for r in content_rows)
|
||||
median_row_h = row_heights[len(row_heights) // 2] if row_heights else 100
|
||||
|
||||
for row_idx, row in enumerate(content_rows):
|
||||
entry: Dict[str, Any] = {
|
||||
'row_index': row_idx,
|
||||
@@ -2926,18 +2930,40 @@ def build_word_grid(
|
||||
if cell_w <= 0 or cell_h <= 0:
|
||||
continue
|
||||
|
||||
cell_region = PageRegion(
|
||||
type=col.type,
|
||||
x=cell_x, y=cell_y,
|
||||
width=cell_w, height=cell_h,
|
||||
)
|
||||
|
||||
# OCR the cell
|
||||
if use_rapid:
|
||||
words = ocr_region_rapid(img_bgr, cell_region)
|
||||
# For oversized cells (>1.5× median), split vertically into sub-cells
|
||||
# and OCR each separately. This prevents OCR from missing text at
|
||||
# the bottom of tall cells (RapidOCR downscales tall narrow crops).
|
||||
is_oversized = row.height > median_row_h * 1.5 and median_row_h > 20
|
||||
if is_oversized:
|
||||
n_splits = max(2, round(row.height / median_row_h))
|
||||
sub_h = cell_h / n_splits
|
||||
words = []
|
||||
for s in range(n_splits):
|
||||
sub_y = int(cell_y + s * sub_h)
|
||||
sub_height = int(sub_h) if s < n_splits - 1 else (cell_y + cell_h - sub_y)
|
||||
sub_region = PageRegion(
|
||||
type=col.type,
|
||||
x=cell_x, y=sub_y,
|
||||
width=cell_w, height=max(1, sub_height),
|
||||
)
|
||||
if use_rapid:
|
||||
sub_words = ocr_region_rapid(img_bgr, sub_region)
|
||||
else:
|
||||
cell_lang = lang_map.get(col.type, lang)
|
||||
sub_words = ocr_region(ocr_img, sub_region, lang=cell_lang, psm=6)
|
||||
words.extend(sub_words)
|
||||
else:
|
||||
cell_lang = lang_map.get(col.type, lang)
|
||||
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
|
||||
cell_region = PageRegion(
|
||||
type=col.type,
|
||||
x=cell_x, y=cell_y,
|
||||
width=cell_w, height=cell_h,
|
||||
)
|
||||
# OCR the cell
|
||||
if use_rapid:
|
||||
words = ocr_region_rapid(img_bgr, cell_region)
|
||||
else:
|
||||
cell_lang = lang_map.get(col.type, lang)
|
||||
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
|
||||
|
||||
# Group into lines, then join in reading order (Fix A)
|
||||
# Use half of average word height as Y-tolerance
|
||||
|
||||
Reference in New Issue
Block a user