fix(ocr-pipeline): group words by vertical center, merge close clusters

Fix half-height rows caused by tall special characters (brackets, IPA symbols) being split into separate line clusters:

- Group words by vertical CENTER instead of TOP position, so tall characters on the same line stay in one cluster
- Filter outlier-height words (>2× median) when computing letter_h so brackets/IPA don't skew the row height
- Merge clusters closer than 0.4× median word height (definitely same text line despite slight center differences)
- Increase y_tolerance from 0.5× to 0.6× median word height
- Enhance logging with cluster merge count and row height range

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 12:14:42 +01:00
parent 8ad5823fd8
commit 97d4355aa9
1 changed file with 72 additions and 12 deletions
@@ -1585,7 +1585,7 @@ def _regularize_row_grid(
        return rows

    # --- Step A: Group ALL words into line clusters ---
-    # Collect words that belong to content rows
+    # Collect words that belong to content rows (deduplicated)
    content_words: List[Dict] = []
    seen_keys: set = set()
    for r in content_rows:
@@ -1598,26 +1598,54 @@ def _regularize_row_grid(
    if len(content_words) < 5:
        return rows

-    # Use half the median word height as grouping tolerance
-    word_heights = [w['height'] for w in content_words]
-    median_wh = sorted(word_heights)[len(word_heights) // 2]
-    y_tol = max(8, int(median_wh * 0.5))
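+    # Compute median word height over ALL content words.  No outliers are
+    # removed here (the median itself tolerates a few tall brackets/IPA
+    # symbols); outlier heights are only filtered later for letter_h.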
+    word_heights = sorted(w['height'] for w in content_words)
+    median_wh = word_heights[len(word_heights) // 2]

-    line_clusters = _group_words_into_lines(content_words, y_tolerance_px=y_tol)
+    # Group by VERTICAL CENTER, not by top.  Tall characters (brackets,
+    # phonetic symbols) start much higher (smaller top value) but have
+    # roughly the same center_y as normal text on the same line.  Grouping
+    # by top would split them into separate clusters → halved pitch →
+    # halved row heights.
+    y_tol = max(10, int(median_wh * 0.6))
+
+    # Sort by center_y, then group by proximity
+    words_by_center = sorted(content_words,
+                             key=lambda w: (w['top'] + w['height'] / 2, w['left']))
+    line_clusters: List[List[Dict]] = []
+    current_line: List[Dict] = [words_by_center[0]]
+    current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2
+
+    for w in words_by_center[1:]:
+        w_center = w['top'] + w['height'] / 2
+        if abs(w_center - current_center) <= y_tol:
+            current_line.append(w)
+        else:
+            current_line.sort(key=lambda w: w['left'])
+            line_clusters.append(current_line)
+            current_line = [w]
+            current_center = w_center
+
+    if current_line:
+        current_line.sort(key=lambda w: w['left'])
+        line_clusters.append(current_line)

    if len(line_clusters) < 3:
        return rows

    # --- Step B: Compute center_y per cluster ---
    # center_y = median of (word_top + word_height/2) across all words in cluster
-    # letter_h = median word height in cluster
-    # All coordinates are relative to content ROI (same as word_dicts)
+    # letter_h = median of word heights, but excluding outlier-height words
+    #            (>2× median) so that tall brackets/IPA don't skew the height
    cluster_info: List[Dict] = []
    for cl_words in line_clusters:
        centers = [w['top'] + w['height'] / 2 for w in cl_words]
-        heights = [w['height'] for w in cl_words]
+        # Filter outlier heights for letter_h computation
+        normal_heights = [w['height'] for w in cl_words
+                          if w['height'] <= median_wh * 2.0]
+        if not normal_heights:
+            normal_heights = [w['height'] for w in cl_words]
        center_y = float(np.median(centers))
-        letter_h = float(np.median(heights))
+        letter_h = float(np.median(normal_heights))
        cluster_info.append({
            'center_y_rel': center_y,  # relative to content ROI
            'center_y_abs': center_y + top_y,  # absolute
@@ -1627,6 +1655,34 @@ def _regularize_row_grid(

    cluster_info.sort(key=lambda c: c['center_y_rel'])

+    # --- Step B2: Merge clusters that are too close together ---
+    # Even with center-based grouping, some edge cases can produce
+    # spurious clusters.  Merge any pair whose centers are closer
+    # than max(5 px, 0.4× median_wh) (they're definitely the same text line).
+    merge_threshold = max(5, median_wh * 0.4)
+    merged: List[Dict] = [cluster_info[0]]
+    for cl in cluster_info[1:]:
+        prev = merged[-1]
+        if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
+            # Merge: combine words, recompute center
+            combined_words = prev['words'] + cl['words']
+            centers = [w['top'] + w['height'] / 2 for w in combined_words]
+            normal_heights = [w['height'] for w in combined_words
+                              if w['height'] <= median_wh * 2.0]
+            if not normal_heights:
+                normal_heights = [w['height'] for w in combined_words]
+            prev['center_y_rel'] = float(np.median(centers))
+            prev['center_y_abs'] = prev['center_y_rel'] + top_y
+            prev['letter_h'] = float(np.median(normal_heights))
+            prev['words'] = combined_words
+        else:
+            merged.append(cl)
+
+    cluster_info = merged
+
+    if len(cluster_info) < 3:
+        return rows
+
    # --- Step C: Compute pitches and detect section breaks ---
    pitches: List[float] = []
    for i in range(1, len(cluster_info)):
@@ -1772,10 +1828,14 @@ def _regularize_row_grid(
    for i, r in enumerate(result):
        r.index = i

+    row_heights = [gr.height for gr in grid_rows]
+    min_h = min(row_heights) if row_heights else 0
+    max_h = max(row_heights) if row_heights else 0
    logger.info(f"RowGrid: word-center grid applied "
-                f"(median_pitch={median_pitch:.0f}px, "
+                f"(median_pitch={median_pitch:.0f}px, median_wh={median_wh}px, "
+                f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
                f"{len(sections)} sections, "
-                f"{len(grid_rows)} grid rows, "
+                f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
                f"was {len(content_rows)} gap-based rows)")

    return result