From 8e861e5a4d3086a3bf6b4246ed8115e91d5e262b Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Sun, 1 Mar 2026 12:34:15 +0100
Subject: [PATCH] fix(ocr-pipeline): use gap-based row height for cluster
 tolerance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The y_tolerance for word-center clustering was based on 60% of the
median word height (e.g. 21px → 12px tolerance), which was too small.
Words on the same line can have centers 15-20px apart due to
different heights.

Now uses 40% of the gap-based median row height as tolerance (e.g.
40px row → 16px tolerance), and 30% of it for the merge threshold
(whose minimum is raised from 5px to 8px). This produces correct
cluster counts matching actual text lines.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 23 +++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 6e4a6be..b2cc866 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -1602,11 +1602,18 @@ def _regularize_row_grid(
     word_heights = sorted(w['height'] for w in content_words)
     median_wh = word_heights[len(word_heights) // 2]
 
-    # Group by VERTICAL CENTER, not by top.  Tall characters (brackets,
-    # phonetic symbols) have a much lower top but the same center_y as
-    # normal text on the same line.  Grouping by top would split them
-    # into separate clusters → halved pitch → halved row heights.
-    y_tol = max(10, int(median_wh * 0.6))
+    # Compute median gap-based row height — this is the actual line height
+    # as detected by the horizontal projection.  We use 40% of this as
+    # grouping tolerance.  This is much more reliable than using word height
+    # alone, because words on the same line can have very different heights
+    # (e.g. lowercase vs uppercase, brackets, phonetic symbols).
+    gap_row_heights = sorted(r.height for r in content_rows)
+    median_row_h = gap_row_heights[len(gap_row_heights) // 2]
+
+    # Tolerance: 40% of row height.  Words on the same line should have
+    # centers within this range.  Even if a word's bbox is taller/shorter,
+    # its center should stay within half a row height of the line center.
+    y_tol = max(10, int(median_row_h * 0.4))
 
     # Sort by center_y, then group by proximity
     words_by_center = sorted(content_words,
@@ -1658,8 +1665,8 @@ def _regularize_row_grid(
     # --- Step B2: Merge clusters that are too close together ---
     # Even with center-based grouping, some edge cases can produce
     # spurious clusters.  Merge any pair whose centers are closer
-    # than 0.4× median_wh (they're definitely the same text line).
-    merge_threshold = max(5, median_wh * 0.4)
+    # than 30% of the row height (they're definitely the same text line).
+    merge_threshold = max(8, median_row_h * 0.3)
     merged: List[Dict] = [cluster_info[0]]
     for cl in cluster_info[1:]:
         prev = merged[-1]
@@ -1832,7 +1839,7 @@ def _regularize_row_grid(
     min_h = min(row_heights) if row_heights else 0
     max_h = max(row_heights) if row_heights else 0
     logger.info(f"RowGrid: word-center grid applied "
-                f"(median_pitch={median_pitch:.0f}px, median_wh={median_wh}px, "
+                f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
                 f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
                 f"{len(sections)} sections, "
                 f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "