From 5fa5767c9ace030a58e38979a4ad2b2d9a3ec248 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 13 Apr 2026 07:55:29 +0200 Subject: [PATCH] Fix box column detection: use low gap_threshold for small zones MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PaddleOCR returns multi-word blocks (whole phrases), so ALL inter-word gaps in small zones (boxes, ≤60 words) are column boundaries. Previous 3x-median approach produced thresholds too high to detect real columns. New approach for small zones: gap_threshold = max(median_h * 1.0, 25). This correctly detects 4 columns in "Pounds and euros" box where gaps range from 50-297px and word height is ~31px. Also includes SmartSpellChecker fixes from previous commits: - Frequency-based scoring, IPA protection, slash→l, rare-word threshold Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/grid_editor_helpers.py | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/klausur-service/backend/grid_editor_helpers.py b/klausur-service/backend/grid_editor_helpers.py index e5165ae..cb8b89c 100644 --- a/klausur-service/backend/grid_editor_helpers.py +++ b/klausur-service/backend/grid_editor_helpers.py @@ -280,14 +280,27 @@ def _cluster_columns_by_alignment( median_gap = sorted_gaps[len(sorted_gaps) // 2] heights = [w["height"] for w in words if w.get("height", 0) > 0] median_h = sorted(heights)[len(heights) // 2] if heights else 25 - # Column boundary: gap > 3× median gap or > 1.5× median word height - gap_threshold = max(median_gap * 3, median_h * 1.5, 30) - # Cap at 25% of zone width — prevents over-merging in small zones (boxes) - # where intra-phrase gaps can inflate the median - max_gap = zone_w * 0.25 - if gap_threshold > max_gap > 30: - logger.info("alignment columns: capping gap_threshold %.0f → %.0f (25%% of zone_w=%d)", gap_threshold, max_gap, zone_w) - gap_threshold = max_gap + + # For small word counts (boxes, sub-zones): PaddleOCR returns + # multi-word blocks, so ALL inter-word gaps are potential column + # boundaries. Use a low threshold based on word height — any gap + # wider than ~1x median word height is a column separator. + if len(words) <= 60: + gap_threshold = max(median_h * 1.0, 25) + logger.info( + "alignment columns (small zone): gap_threshold=%.0f " + "(median_h=%.0f, %d words, %d gaps: %s)", + gap_threshold, median_h, len(words), len(sorted_gaps), + [int(g) for g in sorted_gaps[:10]], + ) + else: + # Standard approach for large zones (full pages) + gap_threshold = max(median_gap * 3, median_h * 1.5, 30) + # Cap at 25% of zone width + max_gap = zone_w * 0.25 + if gap_threshold > max_gap > 30: + logger.info("alignment columns: capping gap_threshold %.0f → %.0f (25%% of zone_w=%d)", gap_threshold, max_gap, zone_w) + gap_threshold = max_gap else: gap_threshold = 50