From 7263328edb577ee1679916517a60b630e7685a95 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 11 Apr 2026 21:24:25 +0200 Subject: [PATCH] Fix marker column detection: remove min-rows requirement Words to the left of the first detected column boundary must always form their own column, regardless of how few rows they appear in. Previously, tertiary (margin) columns required 4+ distinct rows, which missed page references like p.62, p.63, p.64 (only 3 rows). Now any cluster at the left/right margin with a clear gap to the nearest significant column qualifies as its own column, provided it still covers at least 2% of rows (MIN_COVERAGE_TERTIARY lowered from 0.05 to 0.02). Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/grid_editor_helpers.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/klausur-service/backend/grid_editor_helpers.py b/klausur-service/backend/grid_editor_helpers.py index a025fac..8100dc2 100644 --- a/klausur-service/backend/grid_editor_helpers.py +++ b/klausur-service/backend/grid_editor_helpers.py @@ -375,13 +375,17 @@ def _cluster_columns_by_alignment( used_ids = {id(c) for c in primary} | {id(c) for c in secondary} sig_xs = [c["mean_x"] for c in primary + secondary] - MIN_DISTINCT_ROWS_TERTIARY = max(MIN_DISTINCT_ROWS + 1, 4) - MIN_COVERAGE_TERTIARY = 0.05 # at least 5% of rows + # Tertiary: clusters that are clearly to the LEFT of the first + # significant column (or RIGHT of the last). If words consistently + # start at a position left of the established first column boundary, + # they MUST be a separate column — regardless of how few rows they + # cover. The only requirement is a clear spatial gap. + MIN_COVERAGE_TERTIARY = 0.02 # at least 1 row effectively tertiary = [] for c in clusters: if id(c) in used_ids: continue - if c["distinct_rows"] < MIN_DISTINCT_ROWS_TERTIARY: + if c["distinct_rows"] < 1: continue if c["row_coverage"] < MIN_COVERAGE_TERTIARY: continue