From cd12755da64f53c4695f74ac39faaf085f1ab243 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Thu, 5 Mar 2026 12:06:57 +0100
Subject: [PATCH] feat: OCR umlaut confusion correction + bold detection via
 stroke-width
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add umlaut confusion rules (i→ü, a→ä, o→ö, u→ü) to _spell_fix_token
  for German text — fixes "iberqueren" → "überqueren" etc.
- Add _detect_bold() using OpenCV stroke-width analysis on cell crops
- Integrate bold detection in both narrow (cell-crop) and broad (word-lookup) paths
- Add is_bold field to GridCell TypeScript interface
- Render bold text in StepGroundTruth reconstruction view

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../app/(admin)/ai/ocr-pipeline/types.ts      |  1 +
 .../ocr-pipeline/StepGroundTruth.tsx          |  1 +
 klausur-service/backend/cv_vocab_pipeline.py  | 84 ++++++++++++++++++-
 3 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts
index 849d589..a7ba6e0 100644
--- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts
+++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts
@@ -184,6 +184,7 @@ export interface GridCell {
   bbox_px: WordBbox
   bbox_pct: WordBbox
   ocr_engine?: string
+  is_bold?: boolean
   status?: 'pending' | 'confirmed' | 'edited' | 'skipped'
 }
 
diff --git a/admin-lehrer/components/ocr-pipeline/StepGroundTruth.tsx b/admin-lehrer/components/ocr-pipeline/StepGroundTruth.tsx
index 93d2473..6ec6ebe 100644
--- a/admin-lehrer/components/ocr-pipeline/StepGroundTruth.tsx
+++ b/admin-lehrer/components/ocr-pipeline/StepGroundTruth.tsx
@@ -389,6 +389,7 @@ export function StepGroundTruth({ sessionId, onNext }: StepGroundTruthProps) {
                         height: `${cell.bbox_pct.h}%`,
                         color: '#1a1a1a',
                         fontSize: `${fontSize}px`,
+                        fontWeight: cell.is_bold ? 'bold' : 'normal',
                         fontFamily: "'Liberation Sans', 'DejaVu Sans', Arial, sans-serif",
                         display: 'flex',
                         alignItems: 'center',
diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 0e182f7..6ac2650 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -4767,6 +4767,56 @@ def _clean_cell_text_lite(text: str) -> str:
     return stripped
 
 
+# ---------------------------------------------------------------------------
+# Bold detection via stroke-width analysis
+# ---------------------------------------------------------------------------
+
+def _detect_bold(gray_crop: np.ndarray) -> bool:
+    """Heuristically flag bold text from stroke thickness in a cell crop.
+
+    Inverse-Otsu-binarises the crop (darker pixels become 255), erodes that
+    mask with a 3x3 cross for a capped number of rounds (an approximation, not
+    a true skeleton), then averages the L2 distance transform over the pixels
+    that survive and expresses the mean as a percentage of the crop height.
+
+    Returns True if that percentage exceeds 3.5; False for tiny/empty crops.
+    """
+    if gray_crop is None or gray_crop.size == 0:
+        return False
+    h, w = gray_crop.shape[:2]
+    if h < 10 or w < 10:
+        return False
+
+    # Inverse Otsu threshold: darker pixels become 255 (foreground), the rest 0
+    _, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
+    if cv2.countNonZero(bw) < 20:
+        return False
+
+    # Per foreground pixel: L2 distance to the nearest background (zero) pixel
+    dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3)
+
+    # Cheap stand-in for thinning: erode the mask with a 3x3 cross for at most
+    # max(1, min(h, w) // 6) rounds, keeping the last mask with >= 5 pixels left
+    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
+    thin = bw.copy()
+    for _ in range(max(1, min(h, w) // 6)):
+        eroded = cv2.erode(thin, kernel)
+        if cv2.countNonZero(eroded) < 5:
+            break
+        thin = eroded
+
+    # Average distance-transform value over the pixels that survived erosion
+    skeleton_pts = thin > 0
+    if not np.any(skeleton_pts):
+        return False
+    mean_stroke = float(np.mean(dist[skeleton_pts]))
+
+    # Express as % of crop height; the intent is a DPI-independent 3.5 cut-off.
+    # NOTE(review): "normal ≈ 1.0–1.8, bold ≈ 2.0+" presumably meant raw px — confirm
+    normalised = mean_stroke / max(h, 1) * 100  # % of cell height
+    return normalised > 3.5
+
+
 # ---------------------------------------------------------------------------
 # Cell-First OCR (v2) — each cell cropped and OCR'd in isolation
 # ---------------------------------------------------------------------------
@@ -4821,6 +4871,7 @@ def _ocr_cell_crop(
             'h': round(disp_h / img_h * 100, 2) if img_h else 0,
         },
         'ocr_engine': 'cell_crop_v2',
+        'is_bold': False,
     }
 
     if cw <= 0 or ch <= 0:
@@ -4955,10 +5006,17 @@ def _ocr_cell_crop(
                         row_idx, col_idx, pre_filter)
             avg_conf = 0.0
 
+    # --- Bold detection via stroke-width analysis ---
+    is_bold = False
+    if text.strip() and ocr_img is not None:
+        gray_cell = ocr_img[cy:cy + ch, cx:cx + cw]
+        is_bold = _detect_bold(gray_cell)
+
     result = dict(empty_cell)
     result['text'] = text
     result['confidence'] = avg_conf
     result['ocr_engine'] = used_engine
+    result['is_bold'] = is_bold
     return result
 
 
@@ -5105,6 +5163,16 @@ def build_cell_grid_v2(
                 # Apply noise filter
                 text = _clean_cell_text(text)
 
+                # Bold detection for broad columns
+                is_bold = False
+                if text.strip() and ocr_img is not None:
+                    bc_y = max(0, row.y)
+                    bc_h = min(img_h, row.y + row.height) - bc_y
+                    bc_x = max(0, col.x)
+                    bc_w = min(img_w, col.x + col.width) - bc_x
+                    if bc_h > 0 and bc_w > 0:
+                        is_bold = _detect_bold(ocr_img[bc_y:bc_y + bc_h, bc_x:bc_x + bc_w])
+
                 cell = {
                     'cell_id': f"R{row_idx:02d}_C{col_idx}",
                     'row_index': row_idx,
@@ -5123,6 +5191,7 @@ def build_cell_grid_v2(
                         'h': round(row.height / img_h * 100, 2) if img_h else 0,
                     },
                     'ocr_engine': 'word_lookup',
+                    'is_bold': is_bold,
                 }
                 cells.append(cell)
 
@@ -6960,8 +7029,19 @@ def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
                 if not candidate[0].isdigit():
                     return candidate
 
-    # 3. General spell correction for unknown words (no digits/pipes)
-    #    e.g. "iberqueren" → "ueberqueren", "beautful" → "beautiful"
+    # 3. OCR umlaut confusion: OCR often drops umlaut dots (ü→i, ä→a, ö→o, ü→u)
+    #    Try single-char umlaut substitutions and check against dictionary.
+    if len(token) >= 3 and token.isalpha() and field == "german":
+        _UMLAUT_SUBS = {'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
+                         'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü'}
+        for i, ch in enumerate(token):
+            if ch in _UMLAUT_SUBS:
+                candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:]
+                if _spell_dict_knows(candidate):
+                    return candidate
+
+    # 4. General spell correction for unknown words (no digits/pipes)
+    #    e.g. "beautful" → "beautiful"
     if not has_suspicious and len(token) >= 3 and token.isalpha():
         spell = _en_spell if field == "english" else _de_spell if field == "german" else None
         if spell is not None: