fix(ocr-pipeline): generic example attachment + cell padding

1. Semantic example matching: instead of attaching example sentences to the immediately preceding entry, find the vocab entry whose English word(s) appear in the example. "a broken arm" → matches "broken" via word overlap, not "egg/Ei". When no word matches exactly, falls back to prefix ("stem") matching on the first 3–4 letters for regular word-form variants (e.g. "walked" matches "walk"); irregular pairs such as break/broken share only "br" and match only when the form itself is listed in the entry.
2. Cell padding: add 8px padding to each cell region so words at column/row edges don't get clipped by OCR (fixes "er wollte" missing at cell boundaries).
3. Treat very short DE text (≤2 chars) as OCR noise, not real translation — prevents false positives in example detection.

All fixes are generic and deterministic.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 21:24:28 +01:00
parent e3aa8e899e
commit 010616be5a
1 changed files with 87 additions and 36 deletions
@@ -2477,65 +2477,114 @@ def _split_by_comma(text: str) -> List[str]:
 # --- C. Example Sentence Attachment ---
 def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
    """Find the vocab entry whose English word(s) best match the example sentence.
    Returns index into vocab_entries, or -1 if no match found.
    Uses word stem overlap: "a broken arm" matches "broken" or "break".
    """
    if not vocab_entries or not example_text:
        return -1
    example_lower = example_text.lower()
    example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
    best_idx = -1
    best_score = 0
    for i, entry in enumerate(vocab_entries):
        en = (entry.get('english', '') or '').lower()
        if not en:
            continue
        # Extract vocab words (split on space, comma, newline)
        vocab_words = set(re.findall(r'[a-zäöüß]+', en))
        # Score: how many vocab words appear in the example?
        # Also check if example words share a common stem (first 4 chars)
        direct_matches = vocab_words & example_words
        score = len(direct_matches) * 10
        # Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
        if score == 0:
            for vw in vocab_words:
                if len(vw) < 3:
                    continue
                stem = vw[:4] if len(vw) >= 4 else vw[:3]
                for ew in example_words:
                    if len(ew) >= len(stem) and ew[:len(stem)] == stem:
                        score += 5
                        break
        if score > best_score:
            best_score = score
            best_idx = i
    return best_idx if best_score > 0 else -1
 def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """Attach rows with EN text but no DE translation as examples to the preceding entry.
+    """Attach rows with EN text but no DE translation as examples to matching vocab entries.
    Vocabulary worksheets often have:
-      Row 1: break / brechen
+      Row 1: break, broke, broken / brechen, brach, gebrochen
-      Row 2: a broken arm        (no DE → this is an example for "break")
+      Row 2: a broken arm          (no DE → example for "broken")
-      Row 3: a broken plate       (no DE → another example)
+      Row 3: a broken plate         (no DE → example for "broken")
-      Row 4: egg / Ei             (has DE → new vocab entry)
+      Row 4: egg / Ei               (has DE → new vocab entry)
-    Rules (deterministic):
+    Rules (deterministic, generic):
-    - A row is an "example row" if it has EN text but NO DE text
+    - A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars)
-    - It gets attached to the nearest preceding entry that HAS DE text
+    - Find the best matching vocab entry by checking which entry's English words
      appear in the example sentence (semantic matching via word overlap)
    - Fall back to the nearest preceding entry if no word match found
    - Multiple examples get joined with " | "
    """
    if not entries:
        return entries
-    result: List[Dict[str, Any]] = []
+    # Separate into vocab entries (have DE) and example candidates (no DE)
-    pending_examples: List[str] = []
+    vocab_entries: List[Dict[str, Any]] = []
    examples_for: Dict[int, List[str]] = {}  # vocab_index → list of example texts
    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()
        ex = (entry.get('example', '') or '').strip()
-        has_de = bool(de)
+        # Treat very short DE (≤2 chars) as OCR noise, not real translation
        has_de = len(de) > 2
        has_en = bool(en)
-        if has_en and not has_de and result:
+        if has_en and not has_de and vocab_entries:
-            # This is an example sentence — attach to last vocab entry
+            # This is an example sentence — find best matching vocab entry
            example_text = en
            if ex:
                example_text = f"{en} — {ex}"
            pending_examples.append(example_text)
            continue
-        # This is a real vocab entry
+            match_idx = _find_best_vocab_match(en, vocab_entries)
-        # First, flush any pending examples to the previous entry
+            if match_idx < 0:
-        if pending_examples and result:
+                # No word match → fall back to last entry
-            prev = result[-1]
+                match_idx = len(vocab_entries) - 1
            existing_ex = (prev.get('example', '') or '').strip()
            new_examples = ' | '.join(pending_examples)
            prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
            pending_examples = []
-        result.append(entry)
+            if match_idx not in examples_for:
                examples_for[match_idx] = []
            examples_for[match_idx].append(example_text)
        else:
            vocab_entries.append(entry)
-    # Flush remaining examples
+    # Attach examples to their matched vocab entries
-    if pending_examples and result:
+    for idx, example_list in examples_for.items():
-        prev = result[-1]
+        if 0 <= idx < len(vocab_entries):
-        existing_ex = (prev.get('example', '') or '').strip()
+            entry = vocab_entries[idx]
-        new_examples = ' | '.join(pending_examples)
+            existing_ex = (entry.get('example', '') or '').strip()
-        prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
+            new_examples = ' | '.join(example_list)
            entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
    # Re-number
-    for i, e in enumerate(result):
+    for i, e in enumerate(vocab_entries):
        e['row_index'] = i
-    return result
+    return vocab_entries
 # --- D. Phonetic Bracket IPA Replacement ---
@@ -2794,10 +2843,12 @@ def build_word_grid(
        for col in relevant_cols:
            # Compute cell region: column x/width, row y/height
-            cell_x = col.x
+            # Add padding to avoid clipping edge words
-            cell_y = row.y
+            pad = 8  # pixels
-            cell_w = col.width
+            cell_x = col.x - pad
-            cell_h = row.height
+            cell_y = row.y - pad
            cell_w = col.width + 2 * pad
            cell_h = row.height + 2 * pad
            # Clamp to image bounds
            cell_x = max(0, cell_x)