fix: relative bold detection (page median), fix save/finish buttons

Bold detection:
- Replace absolute threshold with page-level relative comparison
- Measure stroke width for all cells, then mark cells >1.4× median as bold
- Adapts automatically to font, DPI and scan quality

Save buttons:
- Fix status stuck on 'error' preventing re-click
- Better error messages with response body
- Fallback score to 0 when null

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 13:02:16 +01:00
parent cd12755da6
commit 1a2efbf075
2 changed files with 68 additions and 43 deletions
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -4768,35 +4768,30 @@ def _clean_cell_text_lite(text: str) -> str:


 # ---------------------------------------------------------------------------
-# Bold detection via stroke-width analysis
+# Bold detection via stroke-width analysis (relative / page-level)
 # ---------------------------------------------------------------------------

-def _detect_bold(gray_crop: np.ndarray) -> bool:
-    """Detect bold text by measuring mean stroke width in a binarised cell crop.
+def _measure_stroke_width(gray_crop: np.ndarray) -> float:
+    """Measure mean stroke width in a binarised cell crop.

-    Bold text has thicker strokes.  We binarise (Otsu), skeletonise to get
-    single-pixel strokes, then compute mean distance-transform value on the
-    skeleton — that approximates half the stroke width.  A value above the
-    threshold indicates bold.
-
-    Returns True if the crop likely contains bold text.
+    Returns a DPI-normalised value (mean skeleton distance, roughly half the
+    stroke width, as % of crop height), or 0.0 if measurement is not possible.
    """
    if gray_crop is None or gray_crop.size == 0:
-        return False
+        return 0.0
    h, w = gray_crop.shape[:2]
    if h < 10 or w < 10:
-        return False
+        return 0.0

    # Binarise: text = white (255), background = black (0)
    _, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
    if cv2.countNonZero(bw) < 20:
-        return False
+        return 0.0

    # Distance transform: value at each white pixel = distance to nearest black
    dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3)

-    # Skeleton via morphological thinning (approximate with erode-based approach)
-    # Use thin iterations of erosion to approximate the medial axis
+    # Approximate the skeleton via iterative erosion (not a true thinning)
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
    thin = bw.copy()
    for _ in range(max(1, min(h, w) // 6)):
@@ -4805,16 +4800,52 @@ def _detect_bold(gray_crop: np.ndarray) -> bool:
            break
        thin = eroded

-    # Mean distance-transform value on the skeleton points
    skeleton_pts = thin > 0
    if not np.any(skeleton_pts):
-        return False
+        return 0.0
    mean_stroke = float(np.mean(dist[skeleton_pts]))
+    return mean_stroke / max(h, 1) * 100  # normalised: % of cell height

-    # Threshold: empirically, normal text ≈ 1.0–1.8, bold ≈ 2.0+
-    # Scale by crop height to be DPI-independent
-    normalised = mean_stroke / max(h, 1) * 100  # % of cell height
-    return normalised > 3.5
+
+def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray],
+                         img_w: int, img_h: int) -> None:
+    """Two-pass bold detection: measure all cells, then compare against median.
+
+    Cells with stroke width > 1.4× the page median are marked as bold; this
+    adapts automatically to font, DPI and scan quality.  Sets 'is_bold' in-place,
+    but leaves cells untouched if ocr_img is None or < 3 cells are measurable.
+    """
+    if ocr_img is None:
+        return
+
+    # Pass 1: measure stroke width for every cell with text
+    metrics: List[float] = []
+    cell_strokes: List[float] = []
+    for cell in cells:
+        sw = 0.0
+        if cell.get('text', '').strip():
+            bp = cell['bbox_px']
+            y1 = max(0, bp['y'])
+            y2 = min(img_h, bp['y'] + bp['h'])
+            x1 = max(0, bp['x'])
+            x2 = min(img_w, bp['x'] + bp['w'])
+            if y2 > y1 and x2 > x1:
+                sw = _measure_stroke_width(ocr_img[y1:y2, x1:x2])
+        cell_strokes.append(sw)
+        if sw > 0:
+            metrics.append(sw)
+
+    if len(metrics) < 3:
+        # Too few cells to compare — leave each cell's 'is_bold' unchanged
+        return
+
+    median_sw = float(np.median(metrics))
+    if median_sw <= 0:
+        return
+
+    # Pass 2: cells significantly above median → bold
+    for cell, sw in zip(cells, cell_strokes):
+        cell['is_bold'] = sw > 0 and (sw / median_sw) > 1.4


 # ---------------------------------------------------------------------------
@@ -5006,17 +5037,10 @@ def _ocr_cell_crop(
                        row_idx, col_idx, pre_filter)
            avg_conf = 0.0

-    # --- Bold detection via stroke-width analysis ---
-    is_bold = False
-    if text.strip() and ocr_img is not None:
-        gray_cell = ocr_img[cy:cy + ch, cx:cx + cw]
-        is_bold = _detect_bold(gray_cell)
-
    result = dict(empty_cell)
    result['text'] = text
    result['confidence'] = avg_conf
    result['ocr_engine'] = used_engine
-    result['is_bold'] = is_bold
    return result


@@ -5163,16 +5187,6 @@ def build_cell_grid_v2(
                # Apply noise filter
                text = _clean_cell_text(text)

-                # Bold detection for broad columns
-                is_bold = False
-                if text.strip() and ocr_img is not None:
-                    bc_y = max(0, row.y)
-                    bc_h = min(img_h, row.y + row.height) - bc_y
-                    bc_x = max(0, col.x)
-                    bc_w = min(img_w, col.x + col.width) - bc_x
-                    if bc_h > 0 and bc_w > 0:
-                        is_bold = _detect_bold(ocr_img[bc_y:bc_y + bc_h, bc_x:bc_x + bc_w])
-
                cell = {
                    'cell_id': f"R{row_idx:02d}_C{col_idx}",
                    'row_index': row_idx,
@@ -5191,7 +5205,7 @@ def build_cell_grid_v2(
                        'h': round(row.height / img_h * 100, 2) if img_h else 0,
                    },
                    'ocr_engine': 'word_lookup',
-                    'is_bold': is_bold,
+                    'is_bold': False,
                }
                cells.append(cell)

@@ -5236,9 +5250,13 @@ def build_cell_grid_v2(
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")

+    # --- Page-level bold detection: compare stroke widths across all cells ---
+    _classify_bold_cells(cells, ocr_img, img_w, img_h)
+    bold_count = sum(1 for c in cells if c.get('is_bold'))
+
    logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
-                f"engine={engine_name} (hybrid)")
+                f"engine={engine_name} (hybrid), {bold_count} bold")

    return cells, columns_meta