fix: relative bold detection (page median), fix save/finish buttons
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m3s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m3s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s
Bold detection:
- Replace absolute threshold with page-level relative comparison
- Measure stroke width for all cells, then mark cells >1.4× median as bold
- Adapts automatically to font, DPI and scan quality

Save buttons:
- Fix status stuck on 'error' preventing re-click
- Better error messages with response body
- Fallback score to 0 when null

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4768,35 +4768,30 @@ def _clean_cell_text_lite(text: str) -> str:
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bold detection via stroke-width analysis
|
||||
# Bold detection via stroke-width analysis (relative / page-level)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _detect_bold(gray_crop: np.ndarray) -> bool:
|
||||
"""Detect bold text by measuring mean stroke width in a binarised cell crop.
|
||||
def _measure_stroke_width(gray_crop: np.ndarray) -> float:
|
||||
"""Measure mean stroke width in a binarised cell crop.
|
||||
|
||||
Bold text has thicker strokes. We binarise (Otsu), skeletonise to get
|
||||
single-pixel strokes, then compute mean distance-transform value on the
|
||||
skeleton — that approximates half the stroke width. A value above the
|
||||
threshold indicates bold.
|
||||
|
||||
Returns True if the crop likely contains bold text.
|
||||
Returns a DPI-normalised value (mean stroke width as % of crop height),
|
||||
or 0.0 if measurement is not possible.
|
||||
"""
|
||||
if gray_crop is None or gray_crop.size == 0:
|
||||
return False
|
||||
return 0.0
|
||||
h, w = gray_crop.shape[:2]
|
||||
if h < 10 or w < 10:
|
||||
return False
|
||||
return 0.0
|
||||
|
||||
# Binarise: text = white (255), background = black (0)
|
||||
_, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
|
||||
if cv2.countNonZero(bw) < 20:
|
||||
return False
|
||||
return 0.0
|
||||
|
||||
# Distance transform: value at each white pixel = distance to nearest black
|
||||
dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3)
|
||||
|
||||
# Skeleton via morphological thinning (approximate with erode-based approach)
|
||||
# Use thin iterations of erosion to approximate the medial axis
|
||||
# Skeleton via morphological thinning
|
||||
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
|
||||
thin = bw.copy()
|
||||
for _ in range(max(1, min(h, w) // 6)):
|
||||
@@ -4805,16 +4800,52 @@ def _detect_bold(gray_crop: np.ndarray) -> bool:
|
||||
break
|
||||
thin = eroded
|
||||
|
||||
# Mean distance-transform value on the skeleton points
|
||||
skeleton_pts = thin > 0
|
||||
if not np.any(skeleton_pts):
|
||||
return False
|
||||
return 0.0
|
||||
mean_stroke = float(np.mean(dist[skeleton_pts]))
|
||||
return mean_stroke / max(h, 1) * 100 # normalised: % of cell height
|
||||
|
||||
# Threshold: empirically, normal text ≈ 1.0–1.8, bold ≈ 2.0+
|
||||
# Scale by crop height to be DPI-independent
|
||||
normalised = mean_stroke / max(h, 1) * 100 # % of cell height
|
||||
return normalised > 3.5
|
||||
|
||||
def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray],
                         img_w: int, img_h: int, *,
                         bold_ratio: float = 1.4, min_samples: int = 3) -> None:
    """Two-pass bold detection: measure all cells, then compare against median.

    Pass 1 measures the stroke width of every cell that has text; pass 2
    marks a cell as bold when its stroke width exceeds ``bold_ratio`` times
    the median of all successful measurements. Comparing against the page
    median instead of an absolute threshold adapts to font, DPI and scan
    quality.

    Modifies ``cells`` in-place. Every cell is guaranteed to carry an
    'is_bold' key afterwards: when no comparison is possible (no image, too
    few measurable cells, non-positive median) a missing key is filled with
    False and an existing value is left untouched.

    Args:
        cells: Cell dicts. Cells with non-blank 'text' must provide
            'bbox_px' with integer 'x', 'y', 'w', 'h' entries (used as
            slice bounds into ``ocr_img``).
        ocr_img: Grayscale page image the cells are cropped from, or None.
        img_w: Image width used to clamp the crop on the right.
        img_h: Image height used to clamp the crop at the bottom.
        bold_ratio: Stroke-width / median ratio above which a cell is bold.
        min_samples: Minimum number of measurable cells required before the
            median is considered meaningful.
    """
    # Make the output schema independent of which exit path is taken; do not
    # overwrite a value a caller may already have set.
    for cell in cells:
        cell.setdefault('is_bold', False)

    if ocr_img is None:
        return

    # Pass 1: measure stroke width for every cell with text (0.0 = no value).
    cell_strokes: List[float] = []
    for cell in cells:
        sw = 0.0
        # 'text' may be present but None; treat that like an empty cell.
        if (cell.get('text') or '').strip():
            bp = cell['bbox_px']
            # Clamp the bounding box to the image so the slice is never
            # negative-indexed or out of range.
            y1 = max(0, bp['y'])
            y2 = min(img_h, bp['y'] + bp['h'])
            x1 = max(0, bp['x'])
            x2 = min(img_w, bp['x'] + bp['w'])
            if y2 > y1 and x2 > x1:
                sw = _measure_stroke_width(ocr_img[y1:y2, x1:x2])
        cell_strokes.append(sw)

    # Only successful measurements take part in the median.
    metrics = [sw for sw in cell_strokes if sw > 0]
    if not metrics or len(metrics) < min_samples:
        # Too few cells to compare — leave all as they are (non-bold default)
        return

    median_sw = float(np.median(metrics))
    if median_sw <= 0:
        return

    # Pass 2: cells significantly above median → bold
    for cell, sw in zip(cells, cell_strokes):
        cell['is_bold'] = bool(sw > 0 and (sw / median_sw) > bold_ratio)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -5006,17 +5037,10 @@ def _ocr_cell_crop(
|
||||
row_idx, col_idx, pre_filter)
|
||||
avg_conf = 0.0
|
||||
|
||||
# --- Bold detection via stroke-width analysis ---
|
||||
is_bold = False
|
||||
if text.strip() and ocr_img is not None:
|
||||
gray_cell = ocr_img[cy:cy + ch, cx:cx + cw]
|
||||
is_bold = _detect_bold(gray_cell)
|
||||
|
||||
result = dict(empty_cell)
|
||||
result['text'] = text
|
||||
result['confidence'] = avg_conf
|
||||
result['ocr_engine'] = used_engine
|
||||
result['is_bold'] = is_bold
|
||||
return result
|
||||
|
||||
|
||||
@@ -5163,16 +5187,6 @@ def build_cell_grid_v2(
|
||||
# Apply noise filter
|
||||
text = _clean_cell_text(text)
|
||||
|
||||
# Bold detection for broad columns
|
||||
is_bold = False
|
||||
if text.strip() and ocr_img is not None:
|
||||
bc_y = max(0, row.y)
|
||||
bc_h = min(img_h, row.y + row.height) - bc_y
|
||||
bc_x = max(0, col.x)
|
||||
bc_w = min(img_w, col.x + col.width) - bc_x
|
||||
if bc_h > 0 and bc_w > 0:
|
||||
is_bold = _detect_bold(ocr_img[bc_y:bc_y + bc_h, bc_x:bc_x + bc_w])
|
||||
|
||||
cell = {
|
||||
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
||||
'row_index': row_idx,
|
||||
@@ -5191,7 +5205,7 @@ def build_cell_grid_v2(
|
||||
'h': round(row.height / img_h * 100, 2) if img_h else 0,
|
||||
},
|
||||
'ocr_engine': 'word_lookup',
|
||||
'is_bold': is_bold,
|
||||
'is_bold': False,
|
||||
}
|
||||
cells.append(cell)
|
||||
|
||||
@@ -5236,9 +5250,13 @@ def build_cell_grid_v2(
|
||||
if empty_rows_removed > 0:
|
||||
logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")
|
||||
|
||||
# --- Page-level bold detection: compare stroke widths across all cells ---
|
||||
_classify_bold_cells(cells, ocr_img, img_w, img_h)
|
||||
bold_count = sum(1 for c in cells if c.get('is_bold'))
|
||||
|
||||
logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
|
||||
f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
|
||||
f"engine={engine_name} (hybrid)")
|
||||
f"engine={engine_name} (hybrid), {bold_count} bold")
|
||||
|
||||
return cells, columns_meta
|
||||
|
||||
|
||||
Reference in New Issue
Block a user