From cd12755da64f53c4695f74ac39faaf085f1ab243 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 5 Mar 2026 12:06:57 +0100 Subject: [PATCH] feat: OCR umlaut confusion correction + bold detection via stroke-width MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add umlaut confusion rules (i→ü, a→ä, o→ö, u→ü) to _spell_fix_token for German text — fixes "iberqueren" → "überqueren" etc. - Add _detect_bold() using OpenCV stroke-width analysis on cell crops - Integrate bold detection in both narrow (cell-crop) and broad (word-lookup) paths - Add is_bold field to GridCell TypeScript interface - Render bold text in StepGroundTruth reconstruction view Co-Authored-By: Claude Opus 4.6 --- .../app/(admin)/ai/ocr-pipeline/types.ts | 1 + .../ocr-pipeline/StepGroundTruth.tsx | 1 + klausur-service/backend/cv_vocab_pipeline.py | 84 ++++++++++++++++++- 3 files changed, 84 insertions(+), 2 deletions(-) diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts index 849d589..a7ba6e0 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts +++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts @@ -184,6 +184,7 @@ export interface GridCell { bbox_px: WordBbox bbox_pct: WordBbox ocr_engine?: string + is_bold?: boolean status?: 'pending' | 'confirmed' | 'edited' | 'skipped' } diff --git a/admin-lehrer/components/ocr-pipeline/StepGroundTruth.tsx b/admin-lehrer/components/ocr-pipeline/StepGroundTruth.tsx index 93d2473..6ec6ebe 100644 --- a/admin-lehrer/components/ocr-pipeline/StepGroundTruth.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepGroundTruth.tsx @@ -389,6 +389,7 @@ export function StepGroundTruth({ sessionId, onNext }: StepGroundTruthProps) { height: `${cell.bbox_pct.h}%`, color: '#1a1a1a', fontSize: `${fontSize}px`, + fontWeight: cell.is_bold ? 'bold' : 'normal', fontFamily: "'Liberation Sans', 'DejaVu Sans', Arial, sans-serif", display: 'flex', alignItems: 'center', diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 0e182f7..6ac2650 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -4767,6 +4767,56 @@ def _clean_cell_text_lite(text: str) -> str: return stripped +# --------------------------------------------------------------------------- +# Bold detection via stroke-width analysis +# --------------------------------------------------------------------------- + +def _detect_bold(gray_crop: np.ndarray) -> bool: + """Detect bold text by measuring mean stroke width in a binarised cell crop. + + Bold text has thicker strokes. We binarise (Otsu), skeletonise to get + single-pixel strokes, then compute mean distance-transform value on the + skeleton — that approximates half the stroke width. A value above the + threshold indicates bold. + + Returns True if the crop likely contains bold text. + """ + if gray_crop is None or gray_crop.size == 0: + return False + h, w = gray_crop.shape[:2] + if h < 10 or w < 10: + return False + + # Binarise: text = white (255), background = black (0) + _, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU) + if cv2.countNonZero(bw) < 20: + return False + + # Distance transform: value at each white pixel = distance to nearest black + dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3) + + # Skeleton via morphological thinning (approximate with erode-based approach) + # Use thin iterations of erosion to approximate the medial axis + kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)) + thin = bw.copy() + for _ in range(max(1, min(h, w) // 6)): + eroded = cv2.erode(thin, kernel) + if cv2.countNonZero(eroded) < 5: + break + thin = eroded + + # Mean distance-transform value on the skeleton points + skeleton_pts = thin > 0 + if not np.any(skeleton_pts): + return False + mean_stroke = float(np.mean(dist[skeleton_pts])) + + # Threshold: empirically, normal text ≈ 1.0–1.8, bold ≈ 2.0+ + # Scale by crop height to be DPI-independent + normalised = mean_stroke / max(h, 1) * 100 # % of cell height + return normalised > 3.5 + + # --------------------------------------------------------------------------- # Cell-First OCR (v2) — each cell cropped and OCR'd in isolation # --------------------------------------------------------------------------- @@ -4821,6 +4871,7 @@ def _ocr_cell_crop( 'h': round(disp_h / img_h * 100, 2) if img_h else 0, }, 'ocr_engine': 'cell_crop_v2', + 'is_bold': False, } if cw <= 0 or ch <= 0: @@ -4955,10 +5006,17 @@ def _ocr_cell_crop( row_idx, col_idx, pre_filter) avg_conf = 0.0 + # --- Bold detection via stroke-width analysis --- + is_bold = False + if text.strip() and ocr_img is not None: + gray_cell = ocr_img[cy:cy + ch, cx:cx + cw] + is_bold = _detect_bold(gray_cell) + result = dict(empty_cell) result['text'] = text result['confidence'] = avg_conf result['ocr_engine'] = used_engine + result['is_bold'] = is_bold return result @@ -5105,6 +5163,16 @@ def build_cell_grid_v2( # Apply noise filter text = _clean_cell_text(text) + # Bold detection for broad columns + is_bold = False + if text.strip() and ocr_img is not None: + bc_y = max(0, row.y) + bc_h = min(img_h, row.y + row.height) - bc_y + bc_x = max(0, col.x) + bc_w = min(img_w, col.x + col.width) - bc_x + if bc_h > 0 and bc_w > 0: + is_bold = _detect_bold(ocr_img[bc_y:bc_y + bc_h, bc_x:bc_x + bc_w]) + cell = { 'cell_id': f"R{row_idx:02d}_C{col_idx}", 'row_index': row_idx, @@ -5123,6 +5191,7 @@ def build_cell_grid_v2( 'h': round(row.height / img_h * 100, 2) if img_h else 0, }, 'ocr_engine': 'word_lookup', + 'is_bold': is_bold, } cells.append(cell) @@ -6960,8 +7029,19 @@ def _spell_fix_token(token: str, field: str = "") -> Optional[str]: if not candidate[0].isdigit(): return candidate - # 3. General spell correction for unknown words (no digits/pipes) - # e.g. "iberqueren" → "ueberqueren", "beautful" → "beautiful" + # 3. OCR umlaut confusion: OCR often drops umlaut dots (ü→i, ä→a, ö→o, ü→u) + # Try single-char umlaut substitutions and check against dictionary. + if len(token) >= 3 and token.isalpha() and field == "german": + _UMLAUT_SUBS = {'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü', + 'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü'} + for i, ch in enumerate(token): + if ch in _UMLAUT_SUBS: + candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:] + if _spell_dict_knows(candidate): + return candidate + + # 4. General spell correction for unknown words (no digits/pipes) + # e.g. "beautful" → "beautiful" if not has_suspicious and len(token) >= 3 and token.isalpha(): spell = _en_spell if field == "english" else _de_spell if field == "german" else None if spell is not None: