From 1162eac7b48442a5b638c3673982ca4aa41002c4 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 15 Mar 2026 00:10:29 +0100 Subject: [PATCH] fix: use group-start positions for column detection, not all word left-edges Only cluster left-edges of words that begin a new group within their row (first word or preceded by a large gap). This filters out mid-phrase word positions (IPA transcriptions, second words in multi-word entries) that were causing too many false columns. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 86 ++++++++++++++++------ 1 file changed, 63 insertions(+), 23 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 4e1173f..0898b19 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -43,16 +43,17 @@ def _cluster_columns_by_alignment( ) -> List[Dict[str, Any]]: """Detect columns by clustering left-edge alignment across rows. - Algorithm (adapted from cv_layout._detect_columns_by_clustering): - 1. Tag each word with its row index - 2. Cluster word left-edges by X-proximity - 3. Count distinct rows per cluster (Y-coverage) - 4. Keep clusters with sufficient row coverage - 5. Merge nearby clusters - 6. Build column boundaries + Hybrid approach: + 1. Group words by row, find "group start" positions within each row + (words preceded by a large gap or first word in row) + 2. Cluster group-start left-edges by X-proximity across rows + 3. Filter by row coverage (how many rows have a group start here) + 4. Merge nearby clusters + 5. Build column boundaries - With real OCR words (from Kombi mode) this is more reliable than the - original ink-based version because left-edge positions are precise. + This filters out mid-phrase word positions (e.g. IPA transcriptions, + second words in multi-word entries) by only considering positions + where a new word group begins within a row. """ if not words or not rows: return [] @@ -61,26 +62,65 @@ def _cluster_columns_by_alignment( if total_rows == 0: return [] - # --- Tag each word with its row index --- - row_of: Dict[int, int] = {} + # --- Group words by row --- + row_words: Dict[int, List[Dict]] = {} for w in words: y_center = w["top"] + w["height"] / 2 best = min(rows, key=lambda r: abs(r["y_center"] - y_center)) - row_of[id(w)] = best["index"] + row_words.setdefault(best["index"], []).append(w) - # --- Collect and sort left-edges --- - edge_data = sorted( - ((w["left"], row_of[id(w)]) for w in words), - key=lambda x: x[0], + # --- Compute adaptive gap threshold for group-start detection --- + all_gaps: List[float] = [] + for ri, rw_list in row_words.items(): + sorted_rw = sorted(rw_list, key=lambda w: w["left"]) + for i in range(len(sorted_rw) - 1): + right = sorted_rw[i]["left"] + sorted_rw[i]["width"] + gap = sorted_rw[i + 1]["left"] - right + if gap > 0: + all_gaps.append(gap) + + if all_gaps: + sorted_gaps = sorted(all_gaps) + median_gap = sorted_gaps[len(sorted_gaps) // 2] + heights = [w["height"] for w in words if w.get("height", 0) > 0] + median_h = sorted(heights)[len(heights) // 2] if heights else 25 + # Column boundary: gap > 3× median gap or > 1.5× median word height + gap_threshold = max(median_gap * 3, median_h * 1.5, 30) + else: + gap_threshold = 50 + + # --- Find group-start positions (left-edges that begin a new column) --- + start_positions: List[tuple] = [] # (left_edge, row_index) + for ri, rw_list in row_words.items(): + sorted_rw = sorted(rw_list, key=lambda w: w["left"]) + # First word in row is always a group start + start_positions.append((sorted_rw[0]["left"], ri)) + for i in range(1, len(sorted_rw)): + right_prev = sorted_rw[i - 1]["left"] + sorted_rw[i - 1]["width"] + gap = sorted_rw[i]["left"] - right_prev + if gap >= gap_threshold: + start_positions.append((sorted_rw[i]["left"], ri)) + + start_positions.sort(key=lambda x: x[0]) + + logger.info( + "alignment columns: %d group-start positions from %d words " + "(gap_threshold=%.0f, %d rows)", + len(start_positions), len(words), gap_threshold, total_rows, ) - # --- Cluster by X-proximity --- + if not start_positions: + x_min = min(w["left"] for w in words) + x_max = max(w["left"] + w["width"] for w in words) + return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}] + + # --- Cluster group-start positions by X-proximity --- tolerance = max(10, int(zone_w * 0.01)) clusters: List[Dict[str, Any]] = [] - cur_edges = [edge_data[0][0]] - cur_rows = {edge_data[0][1]} + cur_edges = [start_positions[0][0]] + cur_rows = {start_positions[0][1]} - for left, row_idx in edge_data[1:]: + for left, row_idx in start_positions[1:]: if left - cur_edges[-1] <= tolerance: cur_edges.append(left) cur_rows.add(row_idx) @@ -105,8 +145,8 @@ def _cluster_columns_by_alignment( }) # --- Filter by row coverage --- - MIN_COVERAGE_PRIMARY = 0.15 - MIN_COVERAGE_SECONDARY = 0.08 + MIN_COVERAGE_PRIMARY = 0.20 + MIN_COVERAGE_SECONDARY = 0.12 MIN_WORDS_SECONDARY = 3 MIN_DISTINCT_ROWS = 2 @@ -126,7 +166,7 @@ def _cluster_columns_by_alignment( significant = sorted(primary + secondary, key=lambda c: c["mean_x"]) logger.info( - "alignment columns: %d clusters total, %d primary, %d secondary → %d significant", + "alignment columns: %d clusters, %d primary, %d secondary → %d significant", len(clusters), len(primary), len(secondary), len(significant), )