From 8e861e5a4d3086a3bf6b4246ed8115e91d5e262b Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 1 Mar 2026 12:34:15 +0100 Subject: [PATCH] fix(ocr-pipeline): use gap-based row height for cluster tolerance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The y_tolerance for word-center clustering was based on median word height (21px → 12px tolerance), which was too small. Words on the same line can have centers 15-20px apart due to different heights. Now uses 40% of the gap-based median row height as tolerance (e.g. 40px row → 16px tolerance), and 30% for merge threshold. This produces correct cluster counts matching actual text lines. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 23 +++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 6e4a6be..b2cc866 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -1602,11 +1602,18 @@ def _regularize_row_grid( word_heights = sorted(w['height'] for w in content_words) median_wh = word_heights[len(word_heights) // 2] - # Group by VERTICAL CENTER, not by top. Tall characters (brackets, - # phonetic symbols) have a much lower top but the same center_y as - # normal text on the same line. Grouping by top would split them - # into separate clusters → halved pitch → halved row heights. - y_tol = max(10, int(median_wh * 0.6)) + # Compute median gap-based row height — this is the actual line height + # as detected by the horizontal projection. We use 40% of this as + # grouping tolerance. This is much more reliable than using word height + # alone, because words on the same line can have very different heights + # (e.g. lowercase vs uppercase, brackets, phonetic symbols). + gap_row_heights = sorted(r.height for r in content_rows) + median_row_h = gap_row_heights[len(gap_row_heights) // 2] + + # Tolerance: 40% of row height. Words on the same line should have + # centers within this range. Even if a word's bbox is taller/shorter, + # its center should stay within half a row height of the line center. + y_tol = max(10, int(median_row_h * 0.4)) # Sort by center_y, then group by proximity words_by_center = sorted(content_words, @@ -1658,8 +1665,8 @@ def _regularize_row_grid( # --- Step B2: Merge clusters that are too close together --- # Even with center-based grouping, some edge cases can produce # spurious clusters. Merge any pair whose centers are closer - # than 0.4× median_wh (they're definitely the same text line). - merge_threshold = max(5, median_wh * 0.4) + # than 30% of the row height (they're definitely the same text line). + merge_threshold = max(8, median_row_h * 0.3) merged: List[Dict] = [cluster_info[0]] for cl in cluster_info[1:]: prev = merged[-1] @@ -1832,7 +1839,7 @@ def _regularize_row_grid( min_h = min(row_heights) if row_heights else 0 max_h = max(row_heights) if row_heights else 0 logger.info(f"RowGrid: word-center grid applied " - f"(median_pitch={median_pitch:.0f}px, median_wh={median_wh}px, " + f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, " f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, " f"{len(sections)} sections, " f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "