feat(ocr-pipeline): uniform grid regularization for row detection (Step 7)

Replace _split_oversized_rows() with _regularize_row_grid(). When ≥60%
of content rows have consistent height (±25% of median), overlay a
uniform grid with the standard row height over the entire content area.
This leverages the fact that books/vocab lists use constant row heights.

Validates the grid by checking that ≥85% of words land in a grid row. Falls
back to gap-based rows if heights are too irregular or words don't fit.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-01 11:50:50 +01:00
parent ba65e47654
commit ec47045c15

View File

@@ -1539,10 +1539,11 @@ def detect_row_geometry(
gap_before=gap_before, gap_before=gap_before,
)) ))
# --- Step 7: Split oversized rows --- # --- Step 7: Uniform grid regularization ---
# If a content row is >1.5× the median height, re-analyze it with a local # Books and vocab lists use a constant row height. If most detected rows
# horizontal projection to find missed row boundaries within. # agree on a height, overlay a uniform grid to fix oversized rows.
rows = _split_oversized_rows(rows, inv, left_x, right_x, top_y, word_dicts) rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
content_w, content_h, inv)
type_counts = {} type_counts = {}
for r in rows: for r in rows:
@@ -1552,172 +1553,154 @@ def detect_row_geometry(
return rows return rows
def _regularize_row_grid(
    rows: List['RowGeometry'],
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int,
    content_w: int, content_h: int,
    inv: np.ndarray,
) -> List['RowGeometry']:
    """Replace gap-based rows with a uniform grid when row heights are consistent.

    Books and vocabulary lists use a constant row height throughout the page.
    If ≥60% of detected content rows have a height within ±25% of the median,
    we overlay a uniform grid with that height over the entire content area.
    This naturally fixes oversized rows without special-case splitting.

    Header/footer rows are preserved as-is.

    Args:
        rows: Rows from the gap-based detection pass (content + non-content).
        word_dicts: OCR word boxes with 'left'/'top'/'width'/'height' keys;
            'top' is relative to the content area (``top_y`` is added to get
            absolute page coordinates).
        left_x, right_x: Horizontal extent of the content area (absolute px).
        top_y: Absolute y offset of the content area.
        content_w: Width assigned to every generated grid row.
        content_h: Unused; kept for signature parity with the previous
            _split_oversized_rows() implementation.
        inv: Inverted binary page image. Unused; kept for signature parity.

    Returns:
        A new, re-indexed row list with content rows replaced by the uniform
        grid, or the original ``rows`` unchanged when there are too few
        content rows, the heights are too irregular, or the grid does not
        fit the words.
    """
    content_rows = [r for r in rows if r.row_type == 'content']
    non_content = [r for r in rows if r.row_type != 'content']
    if len(content_rows) < 5:
        # Not enough rows to establish a reliable pattern
        return rows

    heights = [r.height for r in content_rows]
    heights_sorted = sorted(heights)
    median_h = heights_sorted[len(heights_sorted) // 2]
    if median_h <= 10:
        # Degenerate detection (rows only a few px tall) — no grid basis.
        return rows

    # --- Consistency check: how many rows are within ±25% of median? ---
    tolerance = 0.25
    lo = median_h * (1 - tolerance)
    hi = median_h * (1 + tolerance)
    consistent = sum(1 for h in heights if lo <= h <= hi)
    consistency_ratio = consistent / len(heights)

    if consistency_ratio < 0.6:
        logger.info(f"RowGrid: inconsistent heights ({consistency_ratio:.0%} within "
                    f"±{tolerance:.0%} of median {median_h}px), keeping gap-based rows")
        return rows

    # --- Determine the standard row height more precisely ---
    # Use the mean of consistent rows (those within tolerance) for stability.
    consistent_heights = [h for h in heights if lo <= h <= hi]
    std_height = round(sum(consistent_heights) / len(consistent_heights))

    # --- Determine content zone (between header/footer) ---
    content_start_abs = min(r.y for r in content_rows)
    content_end_abs = max(r.y + r.height for r in content_rows)

    # Anchor the grid on the first well-sized content row; fall back to the
    # topmost content row if none is within tolerance.
    anchor_y = content_start_abs
    for r in content_rows:
        if lo <= r.height <= hi:
            anchor_y = r.y
            break

    # --- Build uniform grid ---
    # Extend the grid upward from the anchor so it covers content_start_abs,
    # stopping once another step up would overshoot the content start by
    # more than 30% of a row height.
    grid_start = anchor_y
    while grid_start - std_height >= content_start_abs - std_height * 0.3:
        grid_start -= std_height

    # Generate grid rows from grid_start down to content_end_abs.
    grid_rows: List[RowGeometry] = []
    y = grid_start
    idx = 0
    while y < content_end_abs - std_height * 0.3:
        row_y = y
        row_h = std_height
        # Last row absorbs the remainder; the while guard guarantees that
        # remainder is > 30% of std_height, so it is never too small.
        if y + std_height >= content_end_abs:
            row_h = content_end_abs - y
        # Assign words to this grid row. NOTE(review): the lower bound tests
        # the word's TOP edge while the upper bound tests its vertical
        # CENTER, so a word straddling a boundary can be assigned to two
        # adjacent rows — confirm this duplication is intended.
        row_words = [w for w in word_dicts
                     if w['top'] + top_y >= row_y - 2
                     and w['top'] + w['height'] / 2 + top_y < row_y + row_h + 2]
        grid_rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=round(row_y),
            width=content_w,
            height=round(row_h),
            word_count=len(row_words),
            words=row_words,
            row_type='content',
            gap_before=0,
        ))
        idx += 1
        y += std_height

    if not grid_rows:
        return rows

    # --- Validate: check that words fit the grid well ---
    # Collect all content words, deduplicated by bounding box (a word may
    # appear in more than one detected row).
    all_content_words = []
    for r in content_rows:
        all_content_words.extend(r.words)
    seen = set()
    unique_words = []
    for w in all_content_words:
        key = (w['left'], w['top'], w['width'], w['height'])
        if key not in seen:
            seen.add(key)
            unique_words.append(w)

    # Require ≥85% of words to have their vertical center inside some grid
    # row; otherwise the uniform-grid hypothesis is wrong for this page.
    if unique_words:
        matched = 0
        for w in unique_words:
            w_center_y = w['top'] + top_y + w['height'] / 2
            for gr in grid_rows:
                if gr.y <= w_center_y < gr.y + gr.height:
                    matched += 1
                    break
        match_ratio = matched / len(unique_words)
        if match_ratio < 0.85:
            logger.info(f"RowGrid: grid only matches {match_ratio:.0%} of words, "
                        f"keeping gap-based rows")
            return rows

    # --- Merge header/footer rows back and re-index by vertical order ---
    result = list(non_content) + grid_rows
    result.sort(key=lambda r: r.y)
    for i, r in enumerate(result):
        r.index = i

    n_oversized = sum(1 for r in content_rows if r.height > std_height * 1.5)
    logger.info(f"RowGrid: uniform grid applied (std_height={std_height}px, "
                f"{len(grid_rows)} grid rows, was {len(content_rows)} content rows, "
                f"{n_oversized} were oversized, "
                f"consistency={consistency_ratio:.0%})")
    return result