diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 55ec82a..6e4a6be 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -1585,7 +1585,7 @@ def _regularize_row_grid( return rows # --- Step A: Group ALL words into line clusters --- - # Collect words that belong to content rows + # Collect words that belong to content rows (deduplicated) content_words: List[Dict] = [] seen_keys: set = set() for r in content_rows: @@ -1598,26 +1598,54 @@ def _regularize_row_grid( if len(content_words) < 5: return rows - # Use half the median word height as grouping tolerance - word_heights = [w['height'] for w in content_words] - median_wh = sorted(word_heights)[len(word_heights) // 2] - y_tol = max(8, int(median_wh * 0.5)) + # Compute median word height (the median is robust to outliers like tall brackets/IPA; nothing is filtered here) + word_heights = sorted(w['height'] for w in content_words) + median_wh = word_heights[len(word_heights) // 2] - line_clusters = _group_words_into_lines(content_words, y_tolerance_px=y_tol) + # Group by VERTICAL CENTER, not by top. Tall characters (brackets, + # phonetic symbols) have a much lower top but the same center_y as + # normal text on the same line. Grouping by top would split them + # into separate clusters → halved pitch → halved row heights. 
+ y_tol = max(10, int(median_wh * 0.6)) + + # Sort by center_y, then group by proximity + words_by_center = sorted(content_words, + key=lambda w: (w['top'] + w['height'] / 2, w['left'])) + line_clusters: List[List[Dict]] = [] + current_line: List[Dict] = [words_by_center[0]] + current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2 + + for w in words_by_center[1:]: + w_center = w['top'] + w['height'] / 2 + if abs(w_center - current_center) <= y_tol: + current_line.append(w) + else: + current_line.sort(key=lambda w: w['left']) + line_clusters.append(current_line) + current_line = [w] + current_center = w_center + + if current_line: + current_line.sort(key=lambda w: w['left']) + line_clusters.append(current_line) if len(line_clusters) < 3: return rows # --- Step B: Compute center_y per cluster --- # center_y = median of (word_top + word_height/2) across all words in cluster - # letter_h = median word height in cluster - # All coordinates are relative to content ROI (same as word_dicts) + # letter_h = median of word heights, but excluding outlier-height words + # (>2× median) so that tall brackets/IPA don't skew the height cluster_info: List[Dict] = [] for cl_words in line_clusters: centers = [w['top'] + w['height'] / 2 for w in cl_words] - heights = [w['height'] for w in cl_words] + # Filter outlier heights for letter_h computation + normal_heights = [w['height'] for w in cl_words + if w['height'] <= median_wh * 2.0] + if not normal_heights: + normal_heights = [w['height'] for w in cl_words] center_y = float(np.median(centers)) - letter_h = float(np.median(heights)) + letter_h = float(np.median(normal_heights)) cluster_info.append({ 'center_y_rel': center_y, # relative to content ROI 'center_y_abs': center_y + top_y, # absolute @@ -1627,6 +1655,34 @@ def _regularize_row_grid( cluster_info.sort(key=lambda c: c['center_y_rel']) + # --- Step B2: Merge clusters that are too close together --- + # Even with center-based grouping, some edge cases can 
produce + # spurious clusters. Merge any pair whose centers are closer + # than 0.4× median_wh (they're definitely the same text line). + merge_threshold = max(5, median_wh * 0.4) + merged: List[Dict] = [cluster_info[0]] + for cl in cluster_info[1:]: + prev = merged[-1] + if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold: + # Merge: combine words, recompute center + combined_words = prev['words'] + cl['words'] + centers = [w['top'] + w['height'] / 2 for w in combined_words] + normal_heights = [w['height'] for w in combined_words + if w['height'] <= median_wh * 2.0] + if not normal_heights: + normal_heights = [w['height'] for w in combined_words] + prev['center_y_rel'] = float(np.median(centers)) + prev['center_y_abs'] = prev['center_y_rel'] + top_y + prev['letter_h'] = float(np.median(normal_heights)) + prev['words'] = combined_words + else: + merged.append(cl) + + cluster_info = merged + + if len(cluster_info) < 3: + return rows + # --- Step C: Compute pitches and detect section breaks --- pitches: List[float] = [] for i in range(1, len(cluster_info)): @@ -1772,10 +1828,14 @@ def _regularize_row_grid( for i, r in enumerate(result): r.index = i + row_heights = [gr.height for gr in grid_rows] + min_h = min(row_heights) if row_heights else 0 + max_h = max(row_heights) if row_heights else 0 logger.info(f"RowGrid: word-center grid applied " - f"(median_pitch={median_pitch:.0f}px, " + f"(median_pitch={median_pitch:.0f}px, median_wh={median_wh}px, " + f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, " f"{len(sections)} sections, " - f"{len(grid_rows)} grid rows, " + f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], " f"was {len(content_rows)} gap-based rows)") return result