fix(ocr-pipeline): group words by vertical center, merge close clusters
Fix half-height rows caused by tall special characters (brackets, IPA symbols) being split into separate line clusters:

- Group words by vertical CENTER instead of TOP position, so tall characters on the same line stay in one cluster
- Filter outlier-height words (>2× median) when computing letter_h so brackets/IPA don't skew the row height
- Merge clusters closer than 0.4× median word height (definitely same text line despite slight center differences)
- Increased y_tolerance from 0.5× to 0.6× median word height
- Enhanced logging with cluster merge count and row height range

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1585,7 +1585,7 @@ def _regularize_row_grid(
|
||||
return rows
|
||||
|
||||
# --- Step A: Group ALL words into line clusters ---
|
||||
# Collect words that belong to content rows
|
||||
# Collect words that belong to content rows (deduplicated)
|
||||
content_words: List[Dict] = []
|
||||
seen_keys: set = set()
|
||||
for r in content_rows:
|
||||
@@ -1598,26 +1598,54 @@ def _regularize_row_grid(
|
||||
if len(content_words) < 5:
|
||||
return rows
|
||||
|
||||
# Use half the median word height as grouping tolerance
|
||||
word_heights = [w['height'] for w in content_words]
|
||||
median_wh = sorted(word_heights)[len(word_heights) // 2]
|
||||
y_tol = max(8, int(median_wh * 0.5))
|
||||
# Compute median word height (excluding outliers like tall brackets/IPA)
|
||||
word_heights = sorted(w['height'] for w in content_words)
|
||||
median_wh = word_heights[len(word_heights) // 2]
|
||||
|
||||
line_clusters = _group_words_into_lines(content_words, y_tolerance_px=y_tol)
|
||||
# Group by VERTICAL CENTER, not by top. Tall characters (brackets,
|
||||
# phonetic symbols) have a much lower top but the same center_y as
|
||||
# normal text on the same line. Grouping by top would split them
|
||||
# into separate clusters → halved pitch → halved row heights.
|
||||
y_tol = max(10, int(median_wh * 0.6))
|
||||
|
||||
# Sort by center_y, then group by proximity
|
||||
words_by_center = sorted(content_words,
|
||||
key=lambda w: (w['top'] + w['height'] / 2, w['left']))
|
||||
line_clusters: List[List[Dict]] = []
|
||||
current_line: List[Dict] = [words_by_center[0]]
|
||||
current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2
|
||||
|
||||
for w in words_by_center[1:]:
|
||||
w_center = w['top'] + w['height'] / 2
|
||||
if abs(w_center - current_center) <= y_tol:
|
||||
current_line.append(w)
|
||||
else:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
line_clusters.append(current_line)
|
||||
current_line = [w]
|
||||
current_center = w_center
|
||||
|
||||
if current_line:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
line_clusters.append(current_line)
|
||||
|
||||
if len(line_clusters) < 3:
|
||||
return rows
|
||||
|
||||
# --- Step B: Compute center_y per cluster ---
|
||||
# center_y = median of (word_top + word_height/2) across all words in cluster
|
||||
# letter_h = median word height in cluster
|
||||
# All coordinates are relative to content ROI (same as word_dicts)
|
||||
# letter_h = median of word heights, but excluding outlier-height words
|
||||
# (>2× median) so that tall brackets/IPA don't skew the height
|
||||
cluster_info: List[Dict] = []
|
||||
for cl_words in line_clusters:
|
||||
centers = [w['top'] + w['height'] / 2 for w in cl_words]
|
||||
heights = [w['height'] for w in cl_words]
|
||||
# Filter outlier heights for letter_h computation
|
||||
normal_heights = [w['height'] for w in cl_words
|
||||
if w['height'] <= median_wh * 2.0]
|
||||
if not normal_heights:
|
||||
normal_heights = [w['height'] for w in cl_words]
|
||||
center_y = float(np.median(centers))
|
||||
letter_h = float(np.median(heights))
|
||||
letter_h = float(np.median(normal_heights))
|
||||
cluster_info.append({
|
||||
'center_y_rel': center_y, # relative to content ROI
|
||||
'center_y_abs': center_y + top_y, # absolute
|
||||
@@ -1627,6 +1655,34 @@ def _regularize_row_grid(
|
||||
|
||||
cluster_info.sort(key=lambda c: c['center_y_rel'])
|
||||
|
||||
# --- Step B2: Merge clusters that are too close together ---
|
||||
# Even with center-based grouping, some edge cases can produce
|
||||
# spurious clusters. Merge any pair whose centers are closer
|
||||
# than 0.4× median_wh (they're definitely the same text line).
|
||||
merge_threshold = max(5, median_wh * 0.4)
|
||||
merged: List[Dict] = [cluster_info[0]]
|
||||
for cl in cluster_info[1:]:
|
||||
prev = merged[-1]
|
||||
if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
|
||||
# Merge: combine words, recompute center
|
||||
combined_words = prev['words'] + cl['words']
|
||||
centers = [w['top'] + w['height'] / 2 for w in combined_words]
|
||||
normal_heights = [w['height'] for w in combined_words
|
||||
if w['height'] <= median_wh * 2.0]
|
||||
if not normal_heights:
|
||||
normal_heights = [w['height'] for w in combined_words]
|
||||
prev['center_y_rel'] = float(np.median(centers))
|
||||
prev['center_y_abs'] = prev['center_y_rel'] + top_y
|
||||
prev['letter_h'] = float(np.median(normal_heights))
|
||||
prev['words'] = combined_words
|
||||
else:
|
||||
merged.append(cl)
|
||||
|
||||
cluster_info = merged
|
||||
|
||||
if len(cluster_info) < 3:
|
||||
return rows
|
||||
|
||||
# --- Step C: Compute pitches and detect section breaks ---
|
||||
pitches: List[float] = []
|
||||
for i in range(1, len(cluster_info)):
|
||||
@@ -1772,10 +1828,14 @@ def _regularize_row_grid(
|
||||
for i, r in enumerate(result):
|
||||
r.index = i
|
||||
|
||||
row_heights = [gr.height for gr in grid_rows]
|
||||
min_h = min(row_heights) if row_heights else 0
|
||||
max_h = max(row_heights) if row_heights else 0
|
||||
logger.info(f"RowGrid: word-center grid applied "
|
||||
f"(median_pitch={median_pitch:.0f}px, "
|
||||
f"(median_pitch={median_pitch:.0f}px, median_wh={median_wh}px, "
|
||||
f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
|
||||
f"{len(sections)} sections, "
|
||||
f"{len(grid_rows)} grid rows, "
|
||||
f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
|
||||
f"was {len(content_rows)} gap-based rows)")
|
||||
|
||||
return result
|
||||
|
||||
Reference in New Issue
Block a user