fix(ocr-pipeline): generic example attachment + cell padding

1. Semantic example matching: instead of attaching example sentences to the immediately preceding entry, find the vocab entry whose English word(s) appear in the example. "a broken arm" → matches "broken" via word overlap, not "egg/Ei". When no exact word matches, falls back to prefix matching for word-form variants (e.g. "broke"/"broken" share the 4-character prefix "brok").
2. Cell padding: add 8px padding to each cell region so words at column/row edges don't get clipped by OCR (fixes "er wollte" missing at cell boundaries).
3. Treat very short DE text (≤2 chars) as OCR noise, not a real translation — prevents false positives in example detection.

All fixes are generic and deterministic.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 21:24:28 +01:00
parent e3aa8e899e
commit 010616be5a
1 changed files with 87 additions and 36 deletions
@@ -2477,65 +2477,114 @@ def _split_by_comma(text: str) -> List[str]:

 # --- C. Example Sentence Attachment ---

+def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
+    """Find the vocab entry whose English word(s) best match the example sentence.
+
+    Returns index into vocab_entries, or -1 if no match found.
+    Uses word overlap with a prefix fallback: "a broken arm" matches "broken" (exact word) or "broke" (shared prefix "brok"), but not "break".
+    """
+    if not vocab_entries or not example_text:
+        return -1
+
+    example_lower = example_text.lower()
+    example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
+
+    best_idx = -1
+    best_score = 0
+
+    for i, entry in enumerate(vocab_entries):
+        en = (entry.get('english', '') or '').lower()
+        if not en:
+            continue
+
+        # Extract vocab words (split on space, comma, newline)
+        vocab_words = set(re.findall(r'[a-zäöüß]+', en))
+
+        # Score: how many vocab words appear in the example?
+        # Also check if example words share a common stem (first 4 chars)
+        direct_matches = vocab_words & example_words
+        score = len(direct_matches) * 10
+
+        # Prefix matching (only when no exact word matched): "broken" matches "broke" via the shared 4-char prefix "brok"; "break" ("brea") does not match
+        if score == 0:
+            for vw in vocab_words:
+                if len(vw) < 3:
+                    continue
+                stem = vw[:4] if len(vw) >= 4 else vw[:3]
+                for ew in example_words:
+                    if len(ew) >= len(stem) and ew[:len(stem)] == stem:
+                        score += 5
+                        break
+
+        if score > best_score:
+            best_score = score
+            best_idx = i
+
+    return best_idx if best_score > 0 else -1
+
+
 def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """Attach rows with EN text but no DE translation as examples to the preceding entry.
+    """Attach rows with EN text but no DE translation as examples to matching vocab entries.

    Vocabulary worksheets often have:
-      Row 1: break / brechen
-      Row 2: a broken arm        (no DE → this is an example for "break")
-      Row 3: a broken plate       (no DE → another example)
-      Row 4: egg / Ei             (has DE → new vocab entry)
+      Row 1: break, broke, broken / brechen, brach, gebrochen
+      Row 2: a broken arm          (no DE → example for "broken")
+      Row 3: a broken plate         (no DE → example for "broken")
+      Row 4: egg / Ei               (has DE → new vocab entry)

-    Rules (deterministic):
-    - A row is an "example row" if it has EN text but NO DE text
-    - It gets attached to the nearest preceding entry that HAS DE text
+    Rules (deterministic, generic):
+    - A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars)
+    - Find the best matching vocab entry by checking which entry's English words
+      appear in the example sentence (semantic matching via word overlap)
+    - Fall back to the nearest preceding entry if no word match found
    - Multiple examples get joined with " | "
    """
    if not entries:
        return entries

-    result: List[Dict[str, Any]] = []
-    pending_examples: List[str] = []
+    # Separate into vocab entries (have DE) and example candidates (no DE)
+    vocab_entries: List[Dict[str, Any]] = []
+    examples_for: Dict[int, List[str]] = {}  # vocab_index → list of example texts

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()
        ex = (entry.get('example', '') or '').strip()

-        has_de = bool(de)
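+        # Treat very short DE (≤2 chars) as OCR noise, not real translation. NOTE(review): this also catches genuine 2-letter translations (e.g. "Ei"), so such rows become examples and their DE text is dropped — confirm intended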
+        has_de = len(de) > 2
        has_en = bool(en)

-        if has_en and not has_de and result:
-            # This is an example sentence — attach to last vocab entry
+        if has_en and not has_de and vocab_entries:
+            # This is an example sentence — find best matching vocab entry
            example_text = en
            if ex:
                example_text = f"{en} — {ex}"
-            pending_examples.append(example_text)
-            continue

-        # This is a real vocab entry
-        # First, flush any pending examples to the previous entry
-        if pending_examples and result:
-            prev = result[-1]
-            existing_ex = (prev.get('example', '') or '').strip()
-            new_examples = ' | '.join(pending_examples)
-            prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
-            pending_examples = []
+            match_idx = _find_best_vocab_match(en, vocab_entries)
+            if match_idx < 0:
+                # No word match → fall back to last entry
+                match_idx = len(vocab_entries) - 1

-        result.append(entry)
+            if match_idx not in examples_for:
+                examples_for[match_idx] = []
+            examples_for[match_idx].append(example_text)
+        else:
+            vocab_entries.append(entry)

-    # Flush remaining examples
-    if pending_examples and result:
-        prev = result[-1]
-        existing_ex = (prev.get('example', '') or '').strip()
-        new_examples = ' | '.join(pending_examples)
-        prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
+    # Attach examples to their matched vocab entries
+    for idx, example_list in examples_for.items():
+        if 0 <= idx < len(vocab_entries):
+            entry = vocab_entries[idx]
+            existing_ex = (entry.get('example', '') or '').strip()
+            new_examples = ' | '.join(example_list)
+            entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples

    # Re-number
-    for i, e in enumerate(result):
+    for i, e in enumerate(vocab_entries):
        e['row_index'] = i

-    return result
+    return vocab_entries


 # --- D. Phonetic Bracket IPA Replacement ---
@@ -2794,10 +2843,12 @@ def build_word_grid(

        for col in relevant_cols:
            # Compute cell region: column x/width, row y/height
-            cell_x = col.x
-            cell_y = row.y
-            cell_w = col.width
-            cell_h = row.height
+            # Add padding to avoid clipping edge words
+            pad = 8  # pixels
+            cell_x = col.x - pad
+            cell_y = row.y - pad
+            cell_w = col.width + 2 * pad
+            cell_h = row.height + 2 * pad

            # Clamp to image bounds
            cell_x = max(0, cell_x)