feat(ocr-pipeline): move oversized row splitting from Step 5 to Step 4
Implement _split_oversized_rows() and call it from detect_row_geometry() (pipeline Step 4) as that function's internal "Step 7", so that content rows taller than 1.5× the median height are split using a local horizontal projection. This produces correctly sized rows before word OCR runs, instead of working around the issue in Step 5 with sub-cell splitting hacks. Removed the Step 5 workarounds: _split_oversized_entries(), the sub-cell splitting in build_word_grid(), and the median_row_h calculation. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1539,6 +1539,11 @@ def detect_row_geometry(
|
|||||||
gap_before=gap_before,
|
gap_before=gap_before,
|
||||||
))
|
))
|
||||||
|
|
||||||
|
# --- Step 7: Split oversized rows ---
|
||||||
|
# If a content row is >1.5× the median height, re-analyze it with a local
|
||||||
|
# horizontal projection to find missed row boundaries within.
|
||||||
|
rows = _split_oversized_rows(rows, inv, left_x, right_x, top_y, word_dicts)
|
||||||
|
|
||||||
type_counts = {}
|
type_counts = {}
|
||||||
for r in rows:
|
for r in rows:
|
||||||
type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
|
type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
|
||||||
@@ -1547,6 +1552,176 @@ def detect_row_geometry(
|
|||||||
return rows
|
return rows
|
||||||
|
|
||||||
|
|
||||||
|
def _split_oversized_rows(
    rows: List['RowGeometry'],
    inv: np.ndarray,
    left_x: int, right_x: int,
    top_y: int,
    word_dicts: List[Dict],
) -> List['RowGeometry']:
    """Split content rows that are >1.5× the median content-row height.

    Each oversized row is re-analysed with a local horizontal projection
    profile (restricted to the pixels covered by word boxes) to find row
    boundaries that the initial gap analysis missed, e.g. because an image
    or dense text obscured the gap.

    Args:
        rows: Detected rows. Only rows with ``row_type == 'content'`` are
            considered for splitting; all others pass through unchanged.
        inv: 2-D image indexed as ``inv[y, x]`` in absolute coordinates.
            NOTE(review): presumably the inverted binary page (ink = 255);
            confirm against the caller.
        left_x: Left edge (absolute x) of the content area.
        right_x: Right edge (absolute x, exclusive) of the content area.
        top_y: Offset added to each word's ``'top'`` to obtain an absolute y.
        word_dicts: Word boxes with ``'left'``, ``'top'``, ``'width'`` and
            ``'height'`` keys. ``'left'`` is used directly as a column of the
            ``left_x:right_x`` strip, i.e. it is treated as relative to
            ``left_x``.

    Returns:
        ``rows`` itself when there are fewer than 3 content rows or the
        median height is <= 10 px. Otherwise a new list in which every
        successfully split row is replaced by its sub-rows. If at least one
        split happened, the list is sorted by ``y`` and every row's ``index``
        is rewritten in place (this also mutates the unsplit input rows).
    """
    content_rows = [r for r in rows if r.row_type == 'content']
    if len(content_rows) < 3:
        return rows

    heights = sorted(r.height for r in content_rows)
    median_h = heights[len(heights) // 2]

    # Rows this small are too noisy for a meaningful "oversized" threshold.
    if median_h <= 10:
        return rows

    threshold = median_h * 1.5
    content_w = right_x - left_x

    result: List['RowGeometry'] = []
    split_total = 0

    for row in rows:
        if row.row_type != 'content' or row.height <= threshold:
            result.append(row)
            continue

        # --- Local horizontal projection on this row's strip ---
        row_y_abs = row.y
        row_h = row.height
        strip = inv[row_y_abs:row_y_abs + row_h, left_x:right_x]

        if strip.size == 0:
            result.append(row)
            continue

        # Word-coverage mask: only pixels inside (vertically padded) word
        # boxes contribute to the projection.
        pad_y = max(2, row_h // 50)
        word_mask = np.zeros_like(strip)
        row_words = [w for w in word_dicts
                     if w['top'] + top_y >= row_y_abs - pad_y
                     and w['top'] + top_y < row_y_abs + row_h + pad_y]

        for wd in row_words:
            wy = wd['top'] + top_y - row_y_abs  # relative to strip
            y1 = max(0, wy - pad_y)
            # Clamp the end to the start: a negative end index would wrap
            # around and mark almost the whole strip as covered.
            y2 = max(y1, min(row_h, wy + wd['height'] + pad_y))
            x1 = max(0, wd['left'])
            x2 = max(x1, min(content_w, wd['left'] + wd['width']))
            word_mask[y1:y2, x1:x2] = 255

        masked = cv2.bitwise_and(strip, word_mask)
        h_proj = np.sum(masked, axis=1).astype(float)
        h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj

        # Smooth with an odd-sized box filter
        k = max(3, row_h // 40)
        if k % 2 == 0:
            k += 1
        h_smooth = np.convolve(h_proj_norm, np.ones(k) / k, mode='same')

        # Gap detection within the row
        positive = h_smooth[h_smooth > 0]
        med_density = float(np.median(positive)) if positive.size else 0.01
        gap_thresh = max(med_density * 0.15, 0.003)
        in_gap = h_smooth < gap_thresh

        min_gap_h = max(2, row_h // 30)  # smaller threshold for sub-gaps
        local_gaps = []
        gap_start = None
        for y, is_gap in enumerate(in_gap):
            if is_gap:
                if gap_start is None:
                    gap_start = y
            elif gap_start is not None:
                if y - gap_start >= min_gap_h:
                    local_gaps.append((gap_start, y))
                gap_start = None
        if gap_start is not None and len(in_gap) - gap_start >= min_gap_h:
            local_gaps.append((gap_start, len(in_gap)))

        if not local_gaps:
            # No sub-gaps found — keep original row
            result.append(row)
            continue

        # Validate gaps against words (don't split through a word).
        word_spans = []
        for wd in row_words:
            wy = wd['top'] + top_y - row_y_abs
            word_spans.append((wy, wy + wd['height']))
        # local_gaps is produced top-to-bottom, so valid_gaps stays sorted.
        valid_gaps = [
            (gs, ge) for gs, ge in local_gaps
            if not any(wy < ge and wy2 > gs for wy, wy2 in word_spans)
        ]

        if not valid_gaps:
            result.append(row)
            continue

        # Build sub-row boundaries from gaps: (start_rel, end_rel) in strip
        sub_bounds = []
        if valid_gaps[0][0] > 0:  # before first gap
            sub_bounds.append((0, valid_gaps[0][0]))
        for (_, prev_gap_end), (next_gap_start, _) in zip(valid_gaps, valid_gaps[1:]):
            sub_bounds.append((prev_gap_end, next_gap_start))
        if valid_gaps[-1][1] < row_h:  # after last gap
            sub_bounds.append((valid_gaps[-1][1], row_h))

        # Filter out empty / sliver sub-rows
        sub_bounds = [(s, e) for s, e in sub_bounds if e - s > 2]

        if len(sub_bounds) <= 1:
            result.append(row)
            continue

        # Create sub-rows
        prev_end = None
        for sb_start, sb_end in sub_bounds:
            sub_y_abs = row_y_abs + sb_start
            sub_h = sb_end - sb_start
            # A word belongs to the sub-row that fully contains it (±2 px).
            sub_words = [w for w in row_words
                         if w['top'] + top_y >= sub_y_abs - 2
                         and w['top'] + top_y + w['height'] <= sub_y_abs + sub_h + 2]
            # Keep real gap information instead of zeroing it: the first
            # sub-row inherits the parent's gap (plus any leading blank
            # space), later ones get the distance to the previous sub-row.
            if prev_end is None:
                sub_gap = (row.gap_before or 0) + sb_start
            else:
                sub_gap = sb_start - prev_end
            prev_end = sb_end
            result.append(RowGeometry(
                index=0,  # re-indexed below
                x=row.x,
                y=sub_y_abs,
                width=row.width,
                height=sub_h,
                word_count=len(sub_words),
                words=sub_words,
                row_type='content',
                gap_before=sub_gap,
            ))

        split_total += len(sub_bounds) - 1
        logger.info(f"RowGeometry: split oversized row (h={row_h}) "
                    f"into {len(sub_bounds)} sub-rows "
                    f"(median_h={median_h}, {len(valid_gaps)} gaps)")

    if split_total > 0:
        # Re-index all rows
        result.sort(key=lambda r: r.y)
        for i, r in enumerate(result):
            r.index = i
        logger.info(f"RowGeometry: {split_total} oversized splits → "
                    f"{len(result)} total rows (was {len(rows)})")

    return result
|
||||||
|
|
||||||
|
|
||||||
def _build_rows_from_word_grouping(
|
def _build_rows_from_word_grouping(
|
||||||
word_dicts: List[Dict],
|
word_dicts: List[Dict],
|
||||||
left_x: int, right_x: int,
|
left_x: int, right_x: int,
|
||||||
@@ -2708,119 +2883,6 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
|
|||||||
return _PHONETIC_BRACKET_RE.sub(replacer, text)
|
return _PHONETIC_BRACKET_RE.sub(replacer, text)
|
||||||
|
|
||||||
|
|
||||||
def _split_oversized_entries(
    entries: List[Dict[str, Any]],
    content_rows: List['RowGeometry'],
    img_w: int,
    img_h: int,
) -> List[Dict[str, Any]]:
    """Split entries from oversized rows into multiple entries.

    An entry whose pixel height exceeds 1.5× the median height of
    ``content_rows`` is assumed to contain several vocabulary lines that the
    row detection failed to separate. It is split by line count:

    * EN and DE both have several lines and the counts match: one
      sub-entry per line pair.
    * Only one of EN / DE has several lines: one sub-entry per line of that
      side; the single line of the other side goes to the first sub-entry.
    * Otherwise (mismatching multi-line counts, or nothing to split) the
      entry is kept unchanged.

    This is a deterministic plausibility check — no LLM needed.

    Args:
        entries: Entry dicts with ``'row_index'``, ``'english'``,
            ``'german'``, ``'example'``, ``'confidence'``, ``'bbox'``
            (``x``/``y``/``w``/``h``; ``h`` is read as a percentage of
            ``img_h``), ``'bbox_en'``, ``'bbox_de'``, ``'bbox_ex'`` and
            optionally ``'ocr_engine'``.
        content_rows: Rows whose ``height`` (pixels) defines the median.
        img_w: Image width in pixels (currently unused).
        img_h: Image height in pixels, used to convert ``bbox['h']``.

    Returns:
        ``entries`` itself when there are fewer than 3 entries, no content
        rows, or a non-positive median height. Otherwise a new list; if at
        least one entry was split, every ``'row_index'`` in the list is
        renumbered in place. Sub-entries share the parent's ``bbox_en`` /
        ``bbox_de`` / ``bbox_ex`` objects and carry ``'split_from_row'``.
    """
    # Without content rows there is no median to compare against (and
    # indexing the empty height list would raise IndexError).
    if len(entries) < 3 or not content_rows:
        return entries

    # Calculate median row height from pixel heights
    row_heights_sorted = sorted(r.height for r in content_rows)
    median_h = row_heights_sorted[len(row_heights_sorted) // 2]

    if median_h <= 0:
        return entries

    def _non_empty_lines(text):
        # Non-blank lines of a cell; always at least one (possibly '') line.
        lines = [ln for ln in text.split('\n') if ln.strip()] if text else []
        return lines or ['']

    height_threshold = median_h * 1.5
    result: List[Dict[str, Any]] = []
    split_count = 0

    for entry in entries:
        # Get pixel height from bbox percent
        entry_h_px = entry['bbox']['h'] / 100.0 * img_h

        if entry_h_px <= height_threshold:
            result.append(entry)
            continue

        # This row is oversized — check if we can split
        en_lines = _non_empty_lines(entry['english'])
        de_lines = _non_empty_lines(entry['german'])
        ex_lines = _non_empty_lines(entry['example'])

        # Determine split count: EN and DE must agree (or one is single-line)
        n_en = len(en_lines)
        n_de = len(de_lines)
        if n_en > 1 and n_de > 1:
            n_split = n_en if n_en == n_de else 1
        else:
            n_split = max(n_en, n_de)

        if n_split <= 1:
            result.append(entry)
            continue

        # Split into n_split sub-entries of equal height
        orig_y = entry['bbox']['y']
        orig_h = entry['bbox']['h']
        sub_h = orig_h / n_split

        for k in range(n_split):
            sub_entry = {
                'row_index': entry['row_index'],
                'english': en_lines[k] if k < len(en_lines) else '',
                'german': de_lines[k] if k < len(de_lines) else '',
                'example': ex_lines[k] if k < len(ex_lines) else '',
                'confidence': entry['confidence'],
                'bbox': {
                    'x': entry['bbox']['x'],
                    'y': round(orig_y + k * sub_h, 2),
                    'w': entry['bbox']['w'],
                    'h': round(sub_h, 2),
                },
                'bbox_en': entry['bbox_en'],
                'bbox_de': entry['bbox_de'],
                'bbox_ex': entry['bbox_ex'],
                'ocr_engine': entry.get('ocr_engine', ''),
                'split_from_row': entry['row_index'],
            }
            result.append(sub_entry)

        split_count += 1
        logger.info(f"split_oversized: row {entry['row_index']} "
                    f"(h={entry_h_px:.0f}px > {height_threshold:.0f}px) "
                    f"→ {n_split} sub-entries")

    if split_count > 0:
        # Re-number row indices
        for i, e in enumerate(result):
            e['row_index'] = i
        logger.info(f"split_oversized: {split_count} rows split, "
                    f"{len(entries)} → {len(result)} entries")

    return result
|
|
||||||
|
|
||||||
|
|
||||||
def build_word_grid(
|
def build_word_grid(
|
||||||
ocr_img: np.ndarray,
|
ocr_img: np.ndarray,
|
||||||
column_regions: List[PageRegion],
|
column_regions: List[PageRegion],
|
||||||
@@ -2885,10 +2947,6 @@ def build_word_grid(
|
|||||||
|
|
||||||
entries: List[Dict[str, Any]] = []
|
entries: List[Dict[str, Any]] = []
|
||||||
|
|
||||||
# Calculate median row height for oversized detection
|
|
||||||
row_heights = sorted(r.height for r in content_rows)
|
|
||||||
median_row_h = row_heights[len(row_heights) // 2] if row_heights else 100
|
|
||||||
|
|
||||||
for row_idx, row in enumerate(content_rows):
|
for row_idx, row in enumerate(content_rows):
|
||||||
entry: Dict[str, Any] = {
|
entry: Dict[str, Any] = {
|
||||||
'row_index': row_idx,
|
'row_index': row_idx,
|
||||||
@@ -2930,40 +2988,17 @@ def build_word_grid(
|
|||||||
if cell_w <= 0 or cell_h <= 0:
|
if cell_w <= 0 or cell_h <= 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# For oversized cells (>1.5× median), split vertically into sub-cells
|
cell_region = PageRegion(
|
||||||
# and OCR each separately. This prevents OCR from missing text at
|
type=col.type,
|
||||||
# the bottom of tall cells (RapidOCR downscales tall narrow crops).
|
x=cell_x, y=cell_y,
|
||||||
is_oversized = row.height > median_row_h * 1.5 and median_row_h > 20
|
width=cell_w, height=cell_h,
|
||||||
if is_oversized:
|
)
|
||||||
n_splits = max(2, round(row.height / median_row_h))
|
# OCR the cell
|
||||||
sub_h = cell_h / n_splits
|
if use_rapid:
|
||||||
words = []
|
words = ocr_region_rapid(img_bgr, cell_region)
|
||||||
for s in range(n_splits):
|
|
||||||
sub_y = int(cell_y + s * sub_h)
|
|
||||||
sub_height = int(sub_h) if s < n_splits - 1 else (cell_y + cell_h - sub_y)
|
|
||||||
sub_region = PageRegion(
|
|
||||||
type=col.type,
|
|
||||||
x=cell_x, y=sub_y,
|
|
||||||
width=cell_w, height=max(1, sub_height),
|
|
||||||
)
|
|
||||||
if use_rapid:
|
|
||||||
sub_words = ocr_region_rapid(img_bgr, sub_region)
|
|
||||||
else:
|
|
||||||
cell_lang = lang_map.get(col.type, lang)
|
|
||||||
sub_words = ocr_region(ocr_img, sub_region, lang=cell_lang, psm=6)
|
|
||||||
words.extend(sub_words)
|
|
||||||
else:
|
else:
|
||||||
cell_region = PageRegion(
|
cell_lang = lang_map.get(col.type, lang)
|
||||||
type=col.type,
|
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
|
||||||
x=cell_x, y=cell_y,
|
|
||||||
width=cell_w, height=cell_h,
|
|
||||||
)
|
|
||||||
# OCR the cell
|
|
||||||
if use_rapid:
|
|
||||||
words = ocr_region_rapid(img_bgr, cell_region)
|
|
||||||
else:
|
|
||||||
cell_lang = lang_map.get(col.type, lang)
|
|
||||||
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
|
|
||||||
|
|
||||||
# Group into lines, then join in reading order (Fix A)
|
# Group into lines, then join in reading order (Fix A)
|
||||||
# Use half of average word height as Y-tolerance
|
# Use half of average word height as Y-tolerance
|
||||||
@@ -3007,16 +3042,13 @@ def build_word_grid(
|
|||||||
# --- Post-processing pipeline (deterministic, no LLM) ---
|
# --- Post-processing pipeline (deterministic, no LLM) ---
|
||||||
n_raw = len(entries)
|
n_raw = len(entries)
|
||||||
|
|
||||||
# 1. Split oversized rows (missed Step 4 boundaries)
|
# 1. Fix character confusion (I/1/l based on context)
|
||||||
entries = _split_oversized_entries(entries, content_rows, img_w, img_h)
|
|
||||||
|
|
||||||
# 2. Fix character confusion (I/1/l based on context)
|
|
||||||
entries = _fix_character_confusion(entries)
|
entries = _fix_character_confusion(entries)
|
||||||
|
|
||||||
# 3. Replace OCR'd phonetics with dictionary IPA
|
# 2. Replace OCR'd phonetics with dictionary IPA
|
||||||
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
|
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
|
||||||
|
|
||||||
# 4. Split comma-separated word forms (break, broke, broken → 3 entries)
|
# 3. Split comma-separated word forms (break, broke, broken → 3 entries)
|
||||||
entries = _split_comma_entries(entries)
|
entries = _split_comma_entries(entries)
|
||||||
|
|
||||||
# 5. Attach example sentences (rows without DE → examples for preceding entry)
|
# 5. Attach example sentences (rows without DE → examples for preceding entry)
|
||||||
|
|||||||
Reference in New Issue
Block a user