feat(ocr-pipeline): move oversized row splitting from Step 5 to Step 4

Implement _split_oversized_rows() in detect_row_geometry() (Step 7) to
split content rows >1.5× median height using local horizontal projection.
This produces correctly-sized rows before word OCR runs, instead of
working around the issue in Step 5 with sub-cell splitting hacks.

Removed Step 5 workarounds: _split_oversized_entries(), sub-cell
splitting in build_word_grid(), and median_row_h calculation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-01 11:46:18 +01:00
parent 8507e2e035
commit ba65e47654

View File

@@ -1539,6 +1539,11 @@ def detect_row_geometry(
gap_before=gap_before,
))
# --- Step 7: Split oversized rows ---
# If a content row is >1.5× the median height, re-analyze it with a local
# horizontal projection to find missed row boundaries within.
rows = _split_oversized_rows(rows, inv, left_x, right_x, top_y, word_dicts)
type_counts = {}
for r in rows:
type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
@@ -1547,6 +1552,176 @@ def detect_row_geometry(
return rows
def _split_oversized_rows(
    rows: List['RowGeometry'],
    inv: np.ndarray,
    left_x: int, right_x: int,
    top_y: int,
    word_dicts: List[Dict],
) -> List['RowGeometry']:
    """Split content rows that are >1.5× the median height.

    Re-analyses oversized rows with a local horizontal projection profile
    to find missed row boundaries within. This catches cases where Step 4's
    initial gap analysis merged multiple vocabulary lines (e.g. because an
    image or dense text obscured the gap).

    Args:
        rows: Detected rows (content and non-content) from Step 4.
        inv: Inverted binary page image (ink is non-zero) indexed [y, x].
        left_x, right_x: Horizontal extent of the content area in `inv`.
        top_y: Vertical offset added to `word['top']` to get absolute y.
        word_dicts: OCR word boxes with 'top', 'left', 'width', 'height'.

    Returns:
        A new list with oversized rows replaced by sub-rows, re-indexed
        by vertical position when any split occurred; otherwise the
        original `rows` list is returned unchanged.
    """
    # Median height is computed over content rows only; with fewer than 3
    # content rows the median is too unstable to trust.
    content_rows = [r for r in rows if r.row_type == 'content']
    if len(content_rows) < 3:
        return rows
    heights = sorted(r.height for r in content_rows)
    median_h = heights[len(heights) // 2]
    # Guard against degenerate geometry (tiny rows → threshold meaningless).
    if median_h <= 10:
        return rows
    threshold = median_h * 1.5
    content_w = right_x - left_x
    result: List[RowGeometry] = []
    split_total = 0
    for row in rows:
        # Only oversized *content* rows are candidates; everything else
        # passes through untouched.
        if row.row_type != 'content' or row.height <= threshold:
            result.append(row)
            continue
        # --- Local horizontal projection on this row's strip ---
        row_y_abs = row.y
        row_h = row.height
        strip = inv[row_y_abs:row_y_abs + row_h, left_x:right_x]
        if strip.size == 0:
            result.append(row)
            continue
        # Word-coverage mask (same approach as main detection): only pixels
        # inside padded OCR word boxes contribute to the projection, so
        # images/rules crossing the row don't hide the inter-line gaps.
        pad_y = max(2, row_h // 50)
        word_mask = np.zeros_like(strip)
        # Words whose (absolute) top falls within this row strip, ± padding.
        row_words = [w for w in word_dicts
                     if w['top'] + top_y >= row_y_abs - pad_y
                     and w['top'] + top_y < row_y_abs + row_h + pad_y]
        for wd in row_words:
            wy = wd['top'] + top_y - row_y_abs  # relative to strip
            y1 = max(0, wy - pad_y)
            y2 = min(row_h, wy + wd['height'] + pad_y)
            # NOTE(review): 'left' is used as strip-relative here while 'top'
            # needed a +top_y offset — assumes word x-coords are already
            # relative to left_x; TODO confirm against the main detection.
            x1 = max(0, wd['left'])
            x2 = min(content_w, wd['left'] + wd['width'])
            word_mask[y1:y2, x1:x2] = 255
        masked = cv2.bitwise_and(strip, word_mask)
        # Ink density per scanline, normalised to [0, 1] by full-white width.
        h_proj = np.sum(masked, axis=1).astype(float)
        h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj
        # Smooth with a small odd-width box filter to suppress single-line noise.
        k = max(3, row_h // 40)
        if k % 2 == 0:
            k += 1
        h_smooth = np.convolve(h_proj_norm, np.ones(k) / k, mode='same')
        # Gap detection within the row: scanlines well below the median ink
        # density of non-empty lines count as gap candidates.
        med_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
        gap_thresh = max(med_density * 0.15, 0.003)
        in_gap = h_smooth < gap_thresh
        min_gap_h = max(2, row_h // 30)  # smaller threshold for sub-gaps
        # Collect maximal runs of gap scanlines at least min_gap_h tall,
        # as (start, end) index pairs relative to the strip.
        local_gaps = []
        gap_start = None
        for y in range(len(in_gap)):
            if in_gap[y]:
                if gap_start is None:
                    gap_start = y
            else:
                if gap_start is not None:
                    if y - gap_start >= min_gap_h:
                        local_gaps.append((gap_start, y))
                    gap_start = None
        # A gap run may extend to the bottom edge of the strip.
        if gap_start is not None and len(in_gap) - gap_start >= min_gap_h:
            local_gaps.append((gap_start, len(in_gap)))
        if not local_gaps:
            # No sub-gaps found — keep original row
            result.append(row)
            continue
        # Validate gaps against words (don't split through a word)
        valid_gaps = []
        for gs, ge in local_gaps:
            overlapping = False
            for wd in row_words:
                wy = wd['top'] + top_y - row_y_abs
                wy2 = wy + wd['height']
                # Standard interval-overlap test between word and gap.
                if wy < ge and wy2 > gs:
                    overlapping = True
                    break
            if not overlapping:
                valid_gaps.append((gs, ge))
        if not valid_gaps:
            result.append(row)
            continue
        valid_gaps.sort()
        # Build sub-row boundaries from gaps
        sub_bounds = []  # (start_rel, end_rel) within the row strip
        # Before first gap
        if valid_gaps[0][0] > 0:
            sub_bounds.append((0, valid_gaps[0][0]))
        # Between gaps
        for i in range(len(valid_gaps) - 1):
            sub_bounds.append((valid_gaps[i][1], valid_gaps[i + 1][0]))
        # After last gap
        if valid_gaps[-1][1] < row_h:
            sub_bounds.append((valid_gaps[-1][1], row_h))
        # Filter out empty sub-rows (≤2 px tall are projection artifacts).
        sub_bounds = [(s, e) for s, e in sub_bounds if e - s > 2]
        if len(sub_bounds) <= 1:
            result.append(row)
            continue
        # Create sub-rows
        for sb_start, sb_end in sub_bounds:
            sub_y_abs = row_y_abs + sb_start
            sub_h = sb_end - sb_start
            # Assign words to this sub-row: the whole word box must fit
            # within the sub-row, with a 2 px tolerance on both edges.
            sub_words = [w for w in row_words
                         if w['top'] + top_y >= sub_y_abs - 2
                         and w['top'] + top_y + w['height'] <= sub_y_abs + sub_h + 2]
            result.append(RowGeometry(
                index=0,  # re-indexed below
                x=row.x,
                y=sub_y_abs,
                width=row.width,
                height=sub_h,
                word_count=len(sub_words),
                words=sub_words,
                row_type='content',
                gap_before=0,
            ))
        split_total += len(sub_bounds) - 1
        logger.info(f"RowGeometry: split oversized row (h={row_h}) "
                    f"into {len(sub_bounds)} sub-rows "
                    f"(median_h={median_h}, {len(valid_gaps)} gaps)")
    if split_total > 0:
        # Re-index all rows by vertical position so indices stay monotonic
        # after sub-rows were appended out of order.
        result.sort(key=lambda r: r.y)
        for i, r in enumerate(result):
            r.index = i
        logger.info(f"RowGeometry: {split_total} oversized splits → "
                    f"{len(result)} total rows (was {len(rows)})")
    return result
def _build_rows_from_word_grouping(
word_dicts: List[Dict],
left_x: int, right_x: int,
@@ -2708,119 +2883,6 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
return _PHONETIC_BRACKET_RE.sub(replacer, text)
def _split_oversized_entries(
    entries: List[Dict[str, Any]],
    content_rows: List[RowGeometry],
    img_w: int,
    img_h: int,
) -> List[Dict[str, Any]]:
    """Split entries that came from oversized rows into one entry per line.

    A row taller than 1.5× the median content-row height most likely holds
    several vocabulary entries that Step 4 failed to separate. When the EN
    and DE texts break into a matching number of newline-separated lines
    (or only one side is multi-line), each line becomes its own entry.

    This is a deterministic plausibility check — no LLM needed.
    """
    if len(entries) < 3:
        return entries

    # Median pixel height of the detected content rows.
    ordered_heights = sorted(r.height for r in content_rows)
    median_h = ordered_heights[len(ordered_heights) // 2]
    if median_h <= 0:
        return entries
    height_threshold = median_h * 1.5

    def nonempty_lines(text):
        # Newline-split with blank lines dropped; never an empty list.
        parts = text.split('\n') if text else ['']
        return [p for p in parts if p.strip()] or ['']

    out: List[Dict[str, Any]] = []
    split_count = 0
    for entry in entries:
        # bbox heights are stored as percent of the page — convert to pixels.
        entry_h_px = entry['bbox']['h'] / 100.0 * img_h
        if entry_h_px <= height_threshold:
            out.append(entry)
            continue

        # Oversized — see whether the texts suggest a clean line-wise split.
        en_lines = nonempty_lines(entry['english'])
        de_lines = nonempty_lines(entry['german'])
        ex_lines = nonempty_lines(entry['example'])
        n_en, n_de = len(en_lines), len(de_lines)

        # Split count: EN and DE must agree, or only one side is multi-line
        # (the single-line side then goes to the first sub-entry).
        if n_en > 1 and (n_de == n_en or n_de <= 1):
            n_split = n_en
        elif n_de > 1 and n_en <= 1:
            n_split = n_de
        else:
            n_split = 1
        if n_split <= 1:
            out.append(entry)
            continue

        # Divide the bbox evenly among the sub-entries.
        base_y = entry['bbox']['y']
        slice_h = entry['bbox']['h'] / n_split
        for k in range(n_split):
            out.append({
                'row_index': entry['row_index'],
                'english': en_lines[k] if k < len(en_lines) else '',
                'german': de_lines[k] if k < len(de_lines) else '',
                'example': ex_lines[k] if k < len(ex_lines) else '',
                'confidence': entry['confidence'],
                'bbox': {
                    'x': entry['bbox']['x'],
                    'y': round(base_y + k * slice_h, 2),
                    'w': entry['bbox']['w'],
                    'h': round(slice_h, 2),
                },
                'bbox_en': entry['bbox_en'],
                'bbox_de': entry['bbox_de'],
                'bbox_ex': entry['bbox_ex'],
                'ocr_engine': entry.get('ocr_engine', ''),
                'split_from_row': entry['row_index'],
            })
        split_count += 1
        logger.info(f"split_oversized: row {entry['row_index']} "
                    f"(h={entry_h_px:.0f}px > {height_threshold:.0f}px) "
                    f"{n_split} sub-entries")

    if split_count > 0:
        # Re-number row indices so downstream steps see a contiguous sequence.
        for i, e in enumerate(out):
            e['row_index'] = i
        logger.info(f"split_oversized: {split_count} rows split, "
                    f"{len(entries)}{len(out)} entries")
    return out
def build_word_grid(
ocr_img: np.ndarray,
column_regions: List[PageRegion],
@@ -2885,10 +2947,6 @@ def build_word_grid(
entries: List[Dict[str, Any]] = []
# Calculate median row height for oversized detection
row_heights = sorted(r.height for r in content_rows)
median_row_h = row_heights[len(row_heights) // 2] if row_heights else 100
for row_idx, row in enumerate(content_rows):
entry: Dict[str, Any] = {
'row_index': row_idx,
@@ -2930,40 +2988,17 @@ def build_word_grid(
if cell_w <= 0 or cell_h <= 0:
continue
# For oversized cells (>1.5× median), split vertically into sub-cells
# and OCR each separately. This prevents OCR from missing text at
# the bottom of tall cells (RapidOCR downscales tall narrow crops).
is_oversized = row.height > median_row_h * 1.5 and median_row_h > 20
if is_oversized:
n_splits = max(2, round(row.height / median_row_h))
sub_h = cell_h / n_splits
words = []
for s in range(n_splits):
sub_y = int(cell_y + s * sub_h)
sub_height = int(sub_h) if s < n_splits - 1 else (cell_y + cell_h - sub_y)
sub_region = PageRegion(
type=col.type,
x=cell_x, y=sub_y,
width=cell_w, height=max(1, sub_height),
)
if use_rapid:
sub_words = ocr_region_rapid(img_bgr, sub_region)
else:
cell_lang = lang_map.get(col.type, lang)
sub_words = ocr_region(ocr_img, sub_region, lang=cell_lang, psm=6)
words.extend(sub_words)
cell_region = PageRegion(
type=col.type,
x=cell_x, y=cell_y,
width=cell_w, height=cell_h,
)
# OCR the cell
if use_rapid:
words = ocr_region_rapid(img_bgr, cell_region)
else:
cell_region = PageRegion(
type=col.type,
x=cell_x, y=cell_y,
width=cell_w, height=cell_h,
)
# OCR the cell
if use_rapid:
words = ocr_region_rapid(img_bgr, cell_region)
else:
cell_lang = lang_map.get(col.type, lang)
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
cell_lang = lang_map.get(col.type, lang)
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
# Group into lines, then join in reading order (Fix A)
# Use half of average word height as Y-tolerance
@@ -3007,16 +3042,13 @@ def build_word_grid(
# --- Post-processing pipeline (deterministic, no LLM) ---
n_raw = len(entries)
# 1. Split oversized rows (missed Step 4 boundaries)
entries = _split_oversized_entries(entries, content_rows, img_w, img_h)
# 2. Fix character confusion (I/1/l based on context)
# 1. Fix character confusion (I/1/l based on context)
entries = _fix_character_confusion(entries)
# 3. Replace OCR'd phonetics with dictionary IPA
# 2. Replace OCR'd phonetics with dictionary IPA
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
# 4. Split comma-separated word forms (break, broke, broken → 3 entries)
# 3. Split comma-separated word forms (break, broke, broken → 3 entries)
entries = _split_comma_entries(entries)
# 5. Attach example sentences (rows without DE → examples for preceding entry)