From 010616be5a711d1b08db84f5f7400e6090f8ff8e Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.fritz.box>
Date: Sat, 28 Feb 2026 21:24:28 +0100
Subject: [PATCH] fix(ocr-pipeline): generic example attachment + cell padding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

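1. Semantic example matching: instead of attaching example sentences
   to the immediately preceding entry, find the vocab entry whose
   English word(s) appear in the example. "a broken arm" → matches
   "broken" via word overlap, not "egg/Ei". When no word matches
   exactly, falls back to prefix matching on the first 4 characters
   (3 for three-letter words), e.g. "walk"/"walking". Irregular forms
   such as break/broken ("brea" vs. "brok") do not share a prefix and
   match only if the entry itself lists the form. If nothing matches,
   the example is attached to the nearest preceding entry.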

2. Cell padding: add 8px padding to each cell region so words at
   column/row edges don't get clipped by OCR (fixes "er wollte"
   missing at cell boundaries).

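3. Treat very short DE text (≤2 chars) as OCR noise rather than a real
   translation, so a stray fragment in the DE column no longer keeps a
   row from being detected as an example. Note: genuine two-letter
   translations (e.g. "Ei") fall under the same threshold.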

All fixes are generic and deterministic.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 123 +++++++++++++------
 1 file changed, 87 insertions(+), 36 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 40d658c..0cd7cbe 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -2477,65 +2477,114 @@ def _split_by_comma(text: str) -> List[str]:
 
 # --- C. Example Sentence Attachment ---
 
+def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
+    """Find the vocab entry whose English word(s) best match the example sentence.
+
+    Returns index into vocab_entries, or -1 if no match found.
+    Exact word overlap is tried first ("a broken arm" matches "broken"), then a shared word prefix.
+    """
+    if not vocab_entries or not example_text:
+        return -1
+
+    example_lower = example_text.lower()
+    example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
+
+    best_idx = -1
+    best_score = 0
+
+    for i, entry in enumerate(vocab_entries):
+        en = (entry.get('english', '') or '').lower()
+        if not en:
+            continue
+
+        # Extract vocab words (split on space, comma, newline)
+        vocab_words = set(re.findall(r'[a-zäöüß]+', en))
+
+        # Score: how many vocab words appear in the example?
+        # Also check if example words share a common stem (first 4 chars)
+        direct_matches = vocab_words & example_words
+        score = len(direct_matches) * 10
+
+        # Prefix matching: "walking" matches "walk"; "broken" does NOT match "break" ("brok" != "brea")
+        if score == 0:
+            for vw in vocab_words:
+                if len(vw) < 3:
+                    continue
+                stem = vw[:4] if len(vw) >= 4 else vw[:3]
+                for ew in example_words:
+                    if len(ew) >= len(stem) and ew[:len(stem)] == stem:
+                        score += 5
+                        break
+
+        if score > best_score:
+            best_score = score
+            best_idx = i
+
+    return best_idx if best_score > 0 else -1
+
+
 def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """Attach rows with EN text but no DE translation as examples to the preceding entry.
+    """Attach rows with EN text but no DE translation as examples to matching vocab entries.
 
     Vocabulary worksheets often have:
-      Row 1: break / brechen
-      Row 2: a broken arm        (no DE → this is an example for "break")
-      Row 3: a broken plate       (no DE → another example)
-      Row 4: egg / Ei             (has DE → new vocab entry)
+      Row 1: break, broke, broken / brechen, brach, gebrochen
+      Row 2: a broken arm          (no DE → example for "broken")
+      Row 3: a broken plate         (no DE → example for "broken")
+      Row 4: egg / Ei               (has DE → new vocab entry)
 
-    Rules (deterministic):
-    - A row is an "example row" if it has EN text but NO DE text
-    - It gets attached to the nearest preceding entry that HAS DE text
+    Rules (deterministic, generic):
+    - A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars)
+    - Find the best matching vocab entry by checking which entry's English words
+      appear in the example sentence (semantic matching via word overlap)
+    - Fall back to the nearest preceding entry if no word match found
     - Multiple examples get joined with " | "
     """
     if not entries:
         return entries
 
-    result: List[Dict[str, Any]] = []
-    pending_examples: List[str] = []
+    # Separate into vocab entries (have DE) and example candidates (no DE)
+    vocab_entries: List[Dict[str, Any]] = []
+    examples_for: Dict[int, List[str]] = {}  # vocab_index → list of example texts
 
     for entry in entries:
         en = (entry.get('english', '') or '').strip()
         de = (entry.get('german', '') or '').strip()
         ex = (entry.get('example', '') or '').strip()
 
-        has_de = bool(de)
+        # Treat very short DE (≤2 chars) as OCR noise, not real translation
+        has_de = len(de) > 2
         has_en = bool(en)
 
-        if has_en and not has_de and result:
-            # This is an example sentence — attach to last vocab entry
+        if has_en and not has_de and vocab_entries:
+            # This is an example sentence — find best matching vocab entry
             example_text = en
             if ex:
                 example_text = f"{en} — {ex}"
-            pending_examples.append(example_text)
-            continue
 
-        # This is a real vocab entry
-        # First, flush any pending examples to the previous entry
-        if pending_examples and result:
-            prev = result[-1]
-            existing_ex = (prev.get('example', '') or '').strip()
-            new_examples = ' | '.join(pending_examples)
-            prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
-            pending_examples = []
+            match_idx = _find_best_vocab_match(en, vocab_entries)
+            if match_idx < 0:
+                # No word match → fall back to last entry
+                match_idx = len(vocab_entries) - 1
 
-        result.append(entry)
+            if match_idx not in examples_for:
+                examples_for[match_idx] = []
+            examples_for[match_idx].append(example_text)
+        else:
+            vocab_entries.append(entry)
 
-    # Flush remaining examples
-    if pending_examples and result:
-        prev = result[-1]
-        existing_ex = (prev.get('example', '') or '').strip()
-        new_examples = ' | '.join(pending_examples)
-        prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
+    # Attach examples to their matched vocab entries
+    for idx, example_list in examples_for.items():
+        if 0 <= idx < len(vocab_entries):
+            entry = vocab_entries[idx]
+            existing_ex = (entry.get('example', '') or '').strip()
+            new_examples = ' | '.join(example_list)
+            entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
 
     # Re-number
-    for i, e in enumerate(result):
+    for i, e in enumerate(vocab_entries):
         e['row_index'] = i
 
-    return result
+    return vocab_entries
 
 
 # --- D. Phonetic Bracket IPA Replacement ---
@@ -2794,10 +2843,12 @@ def build_word_grid(
 
         for col in relevant_cols:
             # Compute cell region: column x/width, row y/height
-            cell_x = col.x
-            cell_y = row.y
-            cell_w = col.width
-            cell_h = row.height
+            # Add padding to avoid clipping edge words
+            pad = 8  # pixels
+            cell_x = col.x - pad
+            cell_y = row.y - pad
+            cell_w = col.width + 2 * pad
+            cell_h = row.height + 2 * pad
 
             # Clamp to image bounds
             cell_x = max(0, cell_x)