fix(ocr-pipeline): group words by vertical center, merge close clusters

Fix half-height rows caused by tall special characters (brackets, IPA
symbols) being split into separate line clusters:

- Group words by vertical CENTER instead of TOP position, so tall
  characters on the same line stay in one cluster
- Exclude outlier-height words (>2× the median word height) when
  computing each cluster's letter_h, so brackets/IPA don't skew the
  row height
- Merge clusters closer than 0.4× median word height (definitely
  same text line despite slight center differences)
- Increase y_tolerance from 0.5× to 0.6× median word height (and raise
  its minimum from 8 px to 10 px)
- Enhance logging with median word height, y_tolerance, cluster counts
  before/after merging, and the row height range

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-01 12:14:42 +01:00
parent 8ad5823fd8
commit 97d4355aa9

View File

@@ -1585,7 +1585,7 @@ def _regularize_row_grid(
return rows
# --- Step A: Group ALL words into line clusters ---
# Collect words that belong to content rows
# Collect words that belong to content rows (deduplicated)
content_words: List[Dict] = []
seen_keys: set = set()
for r in content_rows:
@@ -1598,26 +1598,54 @@ def _regularize_row_grid(
if len(content_words) < 5:
return rows
# Use half the median word height as grouping tolerance
word_heights = [w['height'] for w in content_words]
median_wh = sorted(word_heights)[len(word_heights) // 2]
y_tol = max(8, int(median_wh * 0.5))
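# Compute the median word height over all content words. Outlier-height
# words (tall brackets/IPA) are NOT excluded here; they are filtered
# later, per cluster, when letter_h is computed.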
word_heights = sorted(w['height'] for w in content_words)
median_wh = word_heights[len(word_heights) // 2]
line_clusters = _group_words_into_lines(content_words, y_tolerance_px=y_tol)
# Group by VERTICAL CENTER, not by top. Tall characters (brackets,
# phonetic symbols) have a much lower top but the same center_y as
# normal text on the same line. Grouping by top would split them
# into separate clusters → halved pitch → halved row heights.
y_tol = max(10, int(median_wh * 0.6))
# Sort by center_y, then group by proximity
words_by_center = sorted(content_words,
key=lambda w: (w['top'] + w['height'] / 2, w['left']))
line_clusters: List[List[Dict]] = []
current_line: List[Dict] = [words_by_center[0]]
current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2
for w in words_by_center[1:]:
w_center = w['top'] + w['height'] / 2
if abs(w_center - current_center) <= y_tol:
current_line.append(w)
else:
current_line.sort(key=lambda w: w['left'])
line_clusters.append(current_line)
current_line = [w]
current_center = w_center
if current_line:
current_line.sort(key=lambda w: w['left'])
line_clusters.append(current_line)
if len(line_clusters) < 3:
return rows
# --- Step B: Compute center_y per cluster ---
# center_y = median of (word_top + word_height/2) across all words in cluster
# letter_h = median word height in cluster
# All coordinates are relative to content ROI (same as word_dicts)
# letter_h = median of word heights, but excluding outlier-height words
# (>2× median) so that tall brackets/IPA don't skew the height
cluster_info: List[Dict] = []
for cl_words in line_clusters:
centers = [w['top'] + w['height'] / 2 for w in cl_words]
heights = [w['height'] for w in cl_words]
# Filter outlier heights for letter_h computation
normal_heights = [w['height'] for w in cl_words
if w['height'] <= median_wh * 2.0]
if not normal_heights:
normal_heights = [w['height'] for w in cl_words]
center_y = float(np.median(centers))
letter_h = float(np.median(heights))
letter_h = float(np.median(normal_heights))
cluster_info.append({
'center_y_rel': center_y, # relative to content ROI
'center_y_abs': center_y + top_y, # absolute
@@ -1627,6 +1655,34 @@ def _regularize_row_grid(
cluster_info.sort(key=lambda c: c['center_y_rel'])
# --- Step B2: Merge clusters that are too close together ---
# Even with center-based grouping, some edge cases can produce
# spurious clusters. Walking the clusters in center order, fold each
# one into the previously kept cluster when their centers are closer
# than max(5 px, 0.4× median_wh) (they're definitely the same text line).
merge_threshold = max(5, median_wh * 0.4)
merged: List[Dict] = [cluster_info[0]]
for cl in cluster_info[1:]:
prev = merged[-1]
if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
# Merge: combine words, recompute center
combined_words = prev['words'] + cl['words']
centers = [w['top'] + w['height'] / 2 for w in combined_words]
normal_heights = [w['height'] for w in combined_words
if w['height'] <= median_wh * 2.0]
if not normal_heights:
normal_heights = [w['height'] for w in combined_words]
prev['center_y_rel'] = float(np.median(centers))
prev['center_y_abs'] = prev['center_y_rel'] + top_y
prev['letter_h'] = float(np.median(normal_heights))
prev['words'] = combined_words
else:
merged.append(cl)
cluster_info = merged
if len(cluster_info) < 3:
return rows
# --- Step C: Compute pitches and detect section breaks ---
pitches: List[float] = []
for i in range(1, len(cluster_info)):
@@ -1772,10 +1828,14 @@ def _regularize_row_grid(
for i, r in enumerate(result):
r.index = i
row_heights = [gr.height for gr in grid_rows]
min_h = min(row_heights) if row_heights else 0
max_h = max(row_heights) if row_heights else 0
logger.info(f"RowGrid: word-center grid applied "
f"(median_pitch={median_pitch:.0f}px, "
f"(median_pitch={median_pitch:.0f}px, median_wh={median_wh}px, "
f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
f"{len(sections)} sections, "
f"{len(grid_rows)} grid rows, "
f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
f"was {len(content_rows)} gap-based rows)")
return result