From 770aea611fcc151c0258862185b5a59d2700a109 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Thu, 5 Mar 2026 13:15:59 +0100
Subject: [PATCH] fix: correct example field (fixes iberqueren), disable
 cell-level bold
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add "example" to spell correction loop — was only correcting
  "english" and "german" fields, missing umlauts in example sentences
- Use "german" language for example field (mixed-language, umlauts needed)
- Disable cell-level bold detection — cannot distinguish bold from
  non-bold in mixed-format cells (e.g. "cookie ['kuki]")
- Keep _measure_stroke_width and _classify_bold_cells for future
  word-level bold detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 966f47e..27f7e57 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -5250,13 +5250,14 @@ def build_cell_grid_v2(
     if empty_rows_removed > 0:
         logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")
 
-    # --- Page-level bold detection: compare stroke widths across all cells ---
-    _classify_bold_cells(cells, ocr_img, img_w, img_h)
-    bold_count = sum(1 for c in cells if c.get('is_bold'))
+    # Bold detection disabled: cell-level stroke-width analysis cannot
+    # distinguish bold from non-bold when cells contain mixed formatting
+    # (e.g. "cookie ['kuki]" — bold word + non-bold phonetics).
+    # TODO: word-level bold detection would require per-word bounding boxes.
 
     logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
                 f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
-                f"engine={engine_name} (hybrid), {bold_count} bold")
+                f"engine={engine_name} (hybrid)")
 
     return cells, columns_meta
 
@@ -7132,11 +7133,13 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
         if not _entry_needs_review(e):
             all_corrected.append(e)
             continue
-        for field_name in ("english", "german"):
+        for field_name in ("english", "german", "example"):
             old_val = (e.get(field_name) or "").strip()
             if not old_val:
                 continue
-            new_val, was_changed = _spell_fix_field(old_val, field=field_name)
+            # example field is mixed-language — spell-check it as German (umlauts needed)
+            lang = "german" if field_name in ("german", "example") else "english"
+            new_val, was_changed = _spell_fix_field(old_val, field=lang)
             if was_changed and new_val != old_val:
                 changes.append({
                     "row_index": e.get("row_index", i),