diff --git a/klausur-service/backend/grid_editor_helpers.py b/klausur-service/backend/grid_editor_helpers.py index e5165ae..cb8b89c 100644 --- a/klausur-service/backend/grid_editor_helpers.py +++ b/klausur-service/backend/grid_editor_helpers.py @@ -280,14 +280,27 @@ def _cluster_columns_by_alignment( median_gap = sorted_gaps[len(sorted_gaps) // 2] heights = [w["height"] for w in words if w.get("height", 0) > 0] median_h = sorted(heights)[len(heights) // 2] if heights else 25 - # Column boundary: gap > 3× median gap or > 1.5× median word height - gap_threshold = max(median_gap * 3, median_h * 1.5, 30) - # Cap at 25% of zone width — prevents over-merging in small zones (boxes) - # where intra-phrase gaps can inflate the median - max_gap = zone_w * 0.25 - if gap_threshold > max_gap > 30: - logger.info("alignment columns: capping gap_threshold %.0f → %.0f (25%% of zone_w=%d)", gap_threshold, max_gap, zone_w) - gap_threshold = max_gap + + # For small word counts (boxes, sub-zones): PaddleOCR returns + # multi-word blocks, so ALL inter-word gaps are potential column + # boundaries. Use a low threshold based on word height — any gap + # wider than ~1x median word height is a column separator. + if len(words) <= 60: + gap_threshold = max(median_h * 1.0, 25) + logger.info( + "alignment columns (small zone): gap_threshold=%.0f " + "(median_h=%.0f, %d words, %d gaps: %s)", + gap_threshold, median_h, len(words), len(sorted_gaps), + [int(g) for g in sorted_gaps[:10]], + ) + else: + # Standard approach for large zones (full pages) + gap_threshold = max(median_gap * 3, median_h * 1.5, 30) + # Cap at 25% of zone width + max_gap = zone_w * 0.25 + if gap_threshold > max_gap > 30: + logger.info("alignment columns: capping gap_threshold %.0f → %.0f (25%% of zone_w=%d)", gap_threshold, max_gap, zone_w) + gap_threshold = max_gap else: gap_threshold = 50