fix: relative bold detection (page median), fix save/finish buttons
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m3s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s

Bold detection:
- Replace absolute threshold with page-level relative comparison
- Measure stroke width for all cells, then mark cells >1.4× median as bold
- Adapts automatically to font, DPI and scan quality

Save buttons:
- Fix status stuck on 'error' preventing re-click
- Better error messages with response body
- Fallback score to 0 when null

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-05 13:02:16 +01:00
parent cd12755da6
commit 1a2efbf075
2 changed files with 68 additions and 43 deletions

View File

@@ -4768,35 +4768,30 @@ def _clean_cell_text_lite(text: str) -> str:
# ---------------------------------------------------------------------------
# Bold detection via stroke-width analysis
# Bold detection via stroke-width analysis (relative / page-level)
# ---------------------------------------------------------------------------
def _detect_bold(gray_crop: np.ndarray) -> bool:
"""Detect bold text by measuring mean stroke width in a binarised cell crop.
def _measure_stroke_width(gray_crop: np.ndarray) -> float:
"""Measure mean stroke width in a binarised cell crop.
Bold text has thicker strokes. We binarise (Otsu), skeletonise to get
single-pixel strokes, then compute mean distance-transform value on the
skeleton — that approximates half the stroke width. A value above the
threshold indicates bold.
Returns True if the crop likely contains bold text.
Returns a DPI-normalised value (mean stroke width as % of crop height),
or 0.0 if measurement is not possible.
"""
if gray_crop is None or gray_crop.size == 0:
return False
return 0.0
h, w = gray_crop.shape[:2]
if h < 10 or w < 10:
return False
return 0.0
# Binarise: text = white (255), background = black (0)
_, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
if cv2.countNonZero(bw) < 20:
return False
return 0.0
# Distance transform: value at each white pixel = distance to nearest black
dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3)
# Skeleton via morphological thinning (approximate with erode-based approach)
# Use thin iterations of erosion to approximate the medial axis
# Skeleton via morphological thinning
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
thin = bw.copy()
for _ in range(max(1, min(h, w) // 6)):
@@ -4805,16 +4800,52 @@ def _detect_bold(gray_crop: np.ndarray) -> bool:
break
thin = eroded
# Mean distance-transform value on the skeleton points
skeleton_pts = thin > 0
if not np.any(skeleton_pts):
return False
return 0.0
mean_stroke = float(np.mean(dist[skeleton_pts]))
return mean_stroke / max(h, 1) * 100 # normalised: % of cell height
# Threshold: empirically, normal text ≈ 1.0–1.8, bold ≈ 2.0+
# Scale by crop height to be DPI-independent
normalised = mean_stroke / max(h, 1) * 100 # % of cell height
return normalised > 3.5
def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray],
                         img_w: int, img_h: int, *,
                         bold_ratio: float = 1.4) -> None:
    """Two-pass bold detection: measure all cells, then compare against the median.

    Pass 1 measures a stroke-width metric for every cell that has non-blank
    text; pass 2 marks a cell as bold when its metric exceeds ``bold_ratio``
    times the median of all positive measurements. Comparing against the
    page's own median instead of an absolute threshold lets the decision
    follow whatever font and scan the page happens to have.

    Args:
        cells: Cell dicts. Cells with text must carry a ``'bbox_px'`` dict
            with ``'x'``, ``'y'``, ``'w'``, ``'h'`` entries used to slice
            ``ocr_img``.
        ocr_img: Image the cell crops are taken from, or ``None`` to skip
            measurement entirely.
        img_w: Image width, used to clamp the crop's right edge.
        img_h: Image height, used to clamp the crop's bottom edge.
        bold_ratio: Factor above the median at which a cell counts as bold.

    Returns:
        None. ``cells`` is modified in place: every cell ends up with an
        ``'is_bold'`` key. When classification is skipped (no image, or
        fewer than three measurable cells) an existing value is kept and a
        missing one defaults to ``False``.
    """
    # Guarantee the key on every exit path, without clobbering a value that
    # is already present when we bail out early below.
    for cell in cells:
        cell.setdefault('is_bold', False)

    if ocr_img is None:
        return

    # Pass 1: measure stroke width for every cell with text.
    # 0.0 means "not measured / not measurable" and is kept per cell so the
    # list stays aligned with ``cells`` for pass 2.
    cell_strokes: List[float] = []
    for cell in cells:
        sw = 0.0
        # ``or ''`` tolerates an explicit ``None`` text value.
        if (cell.get('text') or '').strip():
            bp = cell['bbox_px']
            # Clamp the bounding box to the image before slicing.
            y1 = max(0, bp['y'])
            y2 = min(img_h, bp['y'] + bp['h'])
            x1 = max(0, bp['x'])
            x2 = min(img_w, bp['x'] + bp['w'])
            if y2 > y1 and x2 > x1:
                sw = _measure_stroke_width(ocr_img[y1:y2, x1:x2])
        cell_strokes.append(sw)

    metrics = [sw for sw in cell_strokes if sw > 0]
    if len(metrics) < 3:
        # Too few cells to compare — leave all as they are (non-bold by default)
        return

    # All samples are > 0, so the median is > 0 and safe to divide by.
    median_sw = float(np.median(metrics))

    # Pass 2: cells significantly above median → bold
    for cell, sw in zip(cells, cell_strokes):
        cell['is_bold'] = bool(sw > 0 and (sw / median_sw) > bold_ratio)
# ---------------------------------------------------------------------------
@@ -5006,17 +5037,10 @@ def _ocr_cell_crop(
row_idx, col_idx, pre_filter)
avg_conf = 0.0
# --- Bold detection via stroke-width analysis ---
is_bold = False
if text.strip() and ocr_img is not None:
gray_cell = ocr_img[cy:cy + ch, cx:cx + cw]
is_bold = _detect_bold(gray_cell)
result = dict(empty_cell)
result['text'] = text
result['confidence'] = avg_conf
result['ocr_engine'] = used_engine
result['is_bold'] = is_bold
return result
@@ -5163,16 +5187,6 @@ def build_cell_grid_v2(
# Apply noise filter
text = _clean_cell_text(text)
# Bold detection for broad columns
is_bold = False
if text.strip() and ocr_img is not None:
bc_y = max(0, row.y)
bc_h = min(img_h, row.y + row.height) - bc_y
bc_x = max(0, col.x)
bc_w = min(img_w, col.x + col.width) - bc_x
if bc_h > 0 and bc_w > 0:
is_bold = _detect_bold(ocr_img[bc_y:bc_y + bc_h, bc_x:bc_x + bc_w])
cell = {
'cell_id': f"R{row_idx:02d}_C{col_idx}",
'row_index': row_idx,
@@ -5191,7 +5205,7 @@ def build_cell_grid_v2(
'h': round(row.height / img_h * 100, 2) if img_h else 0,
},
'ocr_engine': 'word_lookup',
'is_bold': is_bold,
'is_bold': False,
}
cells.append(cell)
@@ -5236,9 +5250,13 @@ def build_cell_grid_v2(
if empty_rows_removed > 0:
logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")
# --- Page-level bold detection: compare stroke widths across all cells ---
_classify_bold_cells(cells, ocr_img, img_w, img_h)
bold_count = sum(1 for c in cells if c.get('is_bold'))
logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
f"engine={engine_name} (hybrid)")
f"engine={engine_name} (hybrid), {bold_count} bold")
return cells, columns_meta