From 7b294f91509587ca987c0a1e05acb3e70b9fcbe6 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 12 Apr 2026 23:58:15 +0200 Subject: [PATCH] Cap gap_threshold at 25% of zone_w for column detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In small zones (boxes), intra-phrase gaps inflate the median gap, causing gap_threshold to become too large to detect real column boundaries. Cap at 25% of zone width to prevent this. Example: Box "Pounds and euros" has 4 columns at x≈148,534,751,1137 but gap_threshold was 531 (larger than the column gaps themselves). Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/grid_editor_helpers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/klausur-service/backend/grid_editor_helpers.py b/klausur-service/backend/grid_editor_helpers.py index ba1ae36..e5165ae 100644 --- a/klausur-service/backend/grid_editor_helpers.py +++ b/klausur-service/backend/grid_editor_helpers.py @@ -282,6 +282,12 @@ def _cluster_columns_by_alignment( median_h = sorted(heights)[len(heights) // 2] if heights else 25 # Column boundary: gap > 3× median gap or > 1.5× median word height gap_threshold = max(median_gap * 3, median_h * 1.5, 30) + # Cap at 25% of zone width — prevents over-merging in small zones (boxes) + # where intra-phrase gaps can inflate the median + max_gap = zone_w * 0.25 + if gap_threshold > max_gap > 30: + logger.info("alignment columns: capping gap_threshold %.0f → %.0f (25%% of zone_w=%d)", gap_threshold, max_gap, zone_w) + gap_threshold = max_gap else: gap_threshold = 50