fix(ocr-pipeline): generic example attachment + cell padding

1. Semantic example matching: instead of attaching example sentences
   to the immediately preceding entry, find the vocab entry whose
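   English word(s) appear in the example. "a broken arm" → matches
   "broken" via word overlap, not "egg/Ei". Falls back to prefix
   matching for word-form variants (the vocab word's first 4 letters,
   e.g. "walk" matches "walked"); irregular forms such as break/broken
   share no such prefix and only match via direct word overlap.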

2. Cell padding: add 8px padding to each cell region so words at
   column/row edges don't get clipped by OCR (fixes "er wollte"
   missing at cell boundaries).

3. Treat very short DE text (≤2 chars) as OCR noise, not real
   translation — prevents false positives in example detection.

All fixes are generic and deterministic.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-28 21:24:28 +01:00
parent e3aa8e899e
commit 010616be5a

View File

@@ -2477,65 +2477,114 @@ def _split_by_comma(text: str) -> List[str]:
# --- C. Example Sentence Attachment --- # --- C. Example Sentence Attachment ---
def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
    """Find the vocab entry whose English word(s) best match the example sentence.

    Scoring, per vocab entry:
      - 10 points for every vocab word that appears verbatim in the example
        ("a broken arm" matches an entry "break, broke, broken" via "broken").
      - Only when there is no verbatim hit: 5 points for every vocab word of
        3+ letters whose prefix (its first 4 letters, or the whole word for a
        3-letter word) starts some example word ("walk" matches "walked").

    NOTE: the prefix fallback does not cover irregular forms. An entry that
    only lists "break" does NOT match "a broken arm" ("brea" vs. "brok").
    NOTE: verbatim hits have no length filter, so short function words count:
    an entry "a lot" scores 10 against any example containing "a".

    Args:
        example_text: The example sentence (compared case-insensitively).
        vocab_entries: Candidate entries; only the 'english' key is read.

    Returns:
        Index of the first entry with the highest positive score, or -1 if
        no entry scores above zero (or either argument is empty).
    """
    if not vocab_entries or not example_text:
        return -1

    # Same tokenisation for both sides: lowercase runs of Latin/German letters.
    word_pattern = re.compile(r'[a-zäöüß]+')
    example_words = set(word_pattern.findall(example_text.lower()))

    best_idx = -1
    best_score = 0

    for i, entry in enumerate(vocab_entries):
        en = (entry.get('english', '') or '').lower()
        if not en:
            continue

        vocab_words = set(word_pattern.findall(en))

        # Verbatim word overlap is the strong signal.
        score = len(vocab_words & example_words) * 10

        # Prefix fallback only when nothing matched verbatim.
        if score == 0:
            for vw in vocab_words:
                if len(vw) < 3:
                    continue
                # vw[:4] is the whole word for 3-letter words.
                stem = vw[:4]
                if any(ew.startswith(stem) for ew in example_words):
                    score += 5

        # Strict '>' keeps the earliest entry on ties.
        if score > best_score:
            best_score = score
            best_idx = i

    return best_idx if best_score > 0 else -1
def _attach_example_sentences(
    entries: List[Dict[str, Any]],
    *,
    min_german_chars: int = 3,
) -> List[Dict[str, Any]]:
    """Attach rows with EN text but no DE translation as examples to matching vocab entries.

    Vocabulary worksheets often have:
      Row 1: break, broke, broken / brechen, brach, gebrochen
      Row 2: a broken arm    (no DE → example for "broken")
      Row 3: a broken plate  (no DE → example for "broken")
      Row 4: plate / Teller  (has DE → new vocab entry)

    Rules (deterministic, generic):
      - A row is an "example row" if it has EN text and its DE text is
        shorter than ``min_german_chars`` (empty, or so short that it is
        treated as OCR noise), and at least one vocab entry precedes it.
      - The example goes to the preceding vocab entry picked by
        ``_find_best_vocab_match`` (word overlap with the entry's English
        words); if nothing matches, to the nearest preceding vocab entry.
      - Multiple examples get joined with " | ".
      - Every other row is kept as a vocab entry, including a leading
        example-like row that has no vocab entry before it.

    NOTE: with the default threshold, genuine two-letter translations such
    as "Ei" are classified as noise too; pass ``min_german_chars=1`` to
    treat any non-empty DE text as a real translation.

    Args:
        entries: Row dicts; the keys 'english', 'german' and 'example' are read.
        min_german_chars: Minimum length of the stripped DE text for a row
            to count as having a translation. Default 3 (i.e. ≤2 chars is noise).

    Returns:
        The vocab entries in original order, without the example rows. The
        dicts are the input dicts, mutated in place: 'example' is extended on
        matched entries and 'row_index' is renumbered from 0. An empty input
        is returned unchanged.
    """
    if not entries:
        return entries

    vocab_entries: List[Dict[str, Any]] = []
    examples_for: Dict[int, List[str]] = {}  # vocab index → example texts

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()
        ex = (entry.get('example', '') or '').strip()

        has_de = len(de) >= min_german_chars

        # Anything that is not an attachable example row is a vocab entry.
        if not en or has_de or not vocab_entries:
            vocab_entries.append(entry)
            continue

        # Keep the row's own example text, separated like every other example.
        example_text = f"{en} | {ex}" if ex else en

        match_idx = _find_best_vocab_match(en, vocab_entries)
        if match_idx < 0:
            # No word match → fall back to the nearest preceding entry.
            match_idx = len(vocab_entries) - 1

        examples_for.setdefault(match_idx, []).append(example_text)

    # Indices are always valid: vocab_entries only grows after they are taken.
    for idx, example_list in examples_for.items():
        target = vocab_entries[idx]
        existing_ex = (target.get('example', '') or '').strip()
        new_examples = ' | '.join(example_list)
        target['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples

    # Re-number after example rows were removed.
    for i, e in enumerate(vocab_entries):
        e['row_index'] = i

    return vocab_entries
# --- D. Phonetic Bracket IPA Replacement --- # --- D. Phonetic Bracket IPA Replacement ---
@@ -2794,10 +2843,12 @@ def build_word_grid(
for col in relevant_cols: for col in relevant_cols:
# Compute cell region: column x/width, row y/height # Compute cell region: column x/width, row y/height
cell_x = col.x # Add padding to avoid clipping edge words
cell_y = row.y pad = 8 # pixels
cell_w = col.width cell_x = col.x - pad
cell_h = row.height cell_y = row.y - pad
cell_w = col.width + 2 * pad
cell_h = row.height + 2 * pad
# Clamp to image bounds # Clamp to image bounds
cell_x = max(0, cell_x) cell_x = max(0, cell_x)