feat: OCR umlaut confusion correction + bold detection via stroke-width
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m39s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m39s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 18s
- Add umlaut confusion rules (i→ü, a→ä, o→ö, u→ü) to _spell_fix_token for German text — fixes "iberqueren" → "überqueren" etc.
- Add _detect_bold() using OpenCV stroke-width analysis on cell crops
- Integrate bold detection in both narrow (cell-crop) and broad (word-lookup) paths
- Add is_bold field to GridCell TypeScript interface
- Render bold text in StepGroundTruth reconstruction view

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -184,6 +184,7 @@ export interface GridCell {
|
|||||||
bbox_px: WordBbox
|
bbox_px: WordBbox
|
||||||
bbox_pct: WordBbox
|
bbox_pct: WordBbox
|
||||||
ocr_engine?: string
|
ocr_engine?: string
|
||||||
|
is_bold?: boolean
|
||||||
status?: 'pending' | 'confirmed' | 'edited' | 'skipped'
|
status?: 'pending' | 'confirmed' | 'edited' | 'skipped'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -389,6 +389,7 @@ export function StepGroundTruth({ sessionId, onNext }: StepGroundTruthProps) {
|
|||||||
height: `${cell.bbox_pct.h}%`,
|
height: `${cell.bbox_pct.h}%`,
|
||||||
color: '#1a1a1a',
|
color: '#1a1a1a',
|
||||||
fontSize: `${fontSize}px`,
|
fontSize: `${fontSize}px`,
|
||||||
|
fontWeight: cell.is_bold ? 'bold' : 'normal',
|
||||||
fontFamily: "'Liberation Sans', 'DejaVu Sans', Arial, sans-serif",
|
fontFamily: "'Liberation Sans', 'DejaVu Sans', Arial, sans-serif",
|
||||||
display: 'flex',
|
display: 'flex',
|
||||||
alignItems: 'center',
|
alignItems: 'center',
|
||||||
|
|||||||
@@ -4767,6 +4767,56 @@ def _clean_cell_text_lite(text: str) -> str:
|
|||||||
return stripped
|
return stripped
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Bold detection via stroke-width analysis
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _detect_bold(gray_crop: np.ndarray) -> bool:
    """Detect bold text by measuring mean stroke width in a binarised cell crop.

    Bold text has thicker strokes.  We binarise (Otsu), approximate the
    skeleton by iterative erosion, then compute the mean distance-transform
    value on the skeleton points — that approximates half the stroke width.
    The value is normalised by crop height so the decision threshold is
    DPI-independent.

    Args:
        gray_crop: Grayscale image crop of one table cell.  A 3-channel
            (BGR) crop is converted defensively, since Otsu thresholding
            requires a single-channel 8-bit image.

    Returns:
        True if the crop likely contains bold text, False otherwise
        (including for empty, None, or too-small crops).
    """
    if gray_crop is None or gray_crop.size == 0:
        return False
    # Robustness: cv2.threshold with THRESH_OTSU raises on multi-channel
    # input, so convert a colour crop before proceeding.
    if gray_crop.ndim == 3:
        gray_crop = cv2.cvtColor(gray_crop, cv2.COLOR_BGR2GRAY)
    h, w = gray_crop.shape[:2]
    if h < 10 or w < 10:
        return False  # too small for a meaningful stroke measurement

    # Binarise: text = white (255), background = black (0)
    _, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
    if cv2.countNonZero(bw) < 20:
        return False  # almost no ink pixels — empty or noise-only cell

    # Distance transform: value at each white pixel = distance to nearest black
    dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3)

    # Approximate the medial axis by repeated cross-kernel erosion,
    # stopping before the strokes vanish entirely.
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
    thin = bw.copy()
    for _ in range(max(1, min(h, w) // 6)):
        eroded = cv2.erode(thin, kernel)
        if cv2.countNonZero(eroded) < 5:
            break
        thin = eroded

    # Mean distance-transform value on the skeleton ≈ half the stroke width.
    skeleton_pts = thin > 0
    if not np.any(skeleton_pts):
        return False
    mean_stroke = float(np.mean(dist[skeleton_pts]))

    # Threshold: empirically, normal text ≈ 1.0–1.8, bold ≈ 2.0+
    # Scale by crop height to be DPI-independent.
    normalised = mean_stroke / max(h, 1) * 100  # % of cell height
    return normalised > 3.5
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Cell-First OCR (v2) — each cell cropped and OCR'd in isolation
|
# Cell-First OCR (v2) — each cell cropped and OCR'd in isolation
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -4821,6 +4871,7 @@ def _ocr_cell_crop(
|
|||||||
'h': round(disp_h / img_h * 100, 2) if img_h else 0,
|
'h': round(disp_h / img_h * 100, 2) if img_h else 0,
|
||||||
},
|
},
|
||||||
'ocr_engine': 'cell_crop_v2',
|
'ocr_engine': 'cell_crop_v2',
|
||||||
|
'is_bold': False,
|
||||||
}
|
}
|
||||||
|
|
||||||
if cw <= 0 or ch <= 0:
|
if cw <= 0 or ch <= 0:
|
||||||
@@ -4955,10 +5006,17 @@ def _ocr_cell_crop(
|
|||||||
row_idx, col_idx, pre_filter)
|
row_idx, col_idx, pre_filter)
|
||||||
avg_conf = 0.0
|
avg_conf = 0.0
|
||||||
|
|
||||||
|
# --- Bold detection via stroke-width analysis ---
|
||||||
|
is_bold = False
|
||||||
|
if text.strip() and ocr_img is not None:
|
||||||
|
gray_cell = ocr_img[cy:cy + ch, cx:cx + cw]
|
||||||
|
is_bold = _detect_bold(gray_cell)
|
||||||
|
|
||||||
result = dict(empty_cell)
|
result = dict(empty_cell)
|
||||||
result['text'] = text
|
result['text'] = text
|
||||||
result['confidence'] = avg_conf
|
result['confidence'] = avg_conf
|
||||||
result['ocr_engine'] = used_engine
|
result['ocr_engine'] = used_engine
|
||||||
|
result['is_bold'] = is_bold
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
@@ -5105,6 +5163,16 @@ def build_cell_grid_v2(
|
|||||||
# Apply noise filter
|
# Apply noise filter
|
||||||
text = _clean_cell_text(text)
|
text = _clean_cell_text(text)
|
||||||
|
|
||||||
|
# Bold detection for broad columns
|
||||||
|
is_bold = False
|
||||||
|
if text.strip() and ocr_img is not None:
|
||||||
|
bc_y = max(0, row.y)
|
||||||
|
bc_h = min(img_h, row.y + row.height) - bc_y
|
||||||
|
bc_x = max(0, col.x)
|
||||||
|
bc_w = min(img_w, col.x + col.width) - bc_x
|
||||||
|
if bc_h > 0 and bc_w > 0:
|
||||||
|
is_bold = _detect_bold(ocr_img[bc_y:bc_y + bc_h, bc_x:bc_x + bc_w])
|
||||||
|
|
||||||
cell = {
|
cell = {
|
||||||
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
||||||
'row_index': row_idx,
|
'row_index': row_idx,
|
||||||
@@ -5123,6 +5191,7 @@ def build_cell_grid_v2(
|
|||||||
'h': round(row.height / img_h * 100, 2) if img_h else 0,
|
'h': round(row.height / img_h * 100, 2) if img_h else 0,
|
||||||
},
|
},
|
||||||
'ocr_engine': 'word_lookup',
|
'ocr_engine': 'word_lookup',
|
||||||
|
'is_bold': is_bold,
|
||||||
}
|
}
|
||||||
cells.append(cell)
|
cells.append(cell)
|
||||||
|
|
||||||
@@ -6960,8 +7029,19 @@ def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
|
|||||||
if not candidate[0].isdigit():
|
if not candidate[0].isdigit():
|
||||||
return candidate
|
return candidate
|
||||||
|
|
||||||
# 3. General spell correction for unknown words (no digits/pipes)
|
# 3. OCR umlaut confusion: OCR often drops umlaut dots (ü→i, ä→a, ö→o, ü→u)
|
||||||
# e.g. "iberqueren" → "ueberqueren", "beautful" → "beautiful"
|
# Try single-char umlaut substitutions and check against dictionary.
|
||||||
|
if len(token) >= 3 and token.isalpha() and field == "german":
|
||||||
|
_UMLAUT_SUBS = {'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
|
||||||
|
'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü'}
|
||||||
|
for i, ch in enumerate(token):
|
||||||
|
if ch in _UMLAUT_SUBS:
|
||||||
|
candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:]
|
||||||
|
if _spell_dict_knows(candidate):
|
||||||
|
return candidate
|
||||||
|
|
||||||
|
# 4. General spell correction for unknown words (no digits/pipes)
|
||||||
|
# e.g. "beautful" → "beautiful"
|
||||||
if not has_suspicious and len(token) >= 3 and token.isalpha():
|
if not has_suspicious and len(token) >= 3 and token.isalpha():
|
||||||
spell = _en_spell if field == "english" else _de_spell if field == "german" else None
|
spell = _en_spell if field == "english" else _de_spell if field == "german" else None
|
||||||
if spell is not None:
|
if spell is not None:
|
||||||
|
|||||||
Reference in New Issue
Block a user