fix(ocr-pipeline): generic example attachment + cell padding

1. Semantic example matching: instead of attaching example sentences
   to the immediately preceding entry, find the vocab entry whose
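   English word(s) appear in the example. "a broken arm" → matches
   "broken" via word overlap, not "egg/Ei". Falls back to prefix
   matching for word-form variants: the first 4 letters of a vocab
   word (3 for three-letter words) must start a word of the example,
   e.g. "break" matches "breaking". Irregular forms such as
   break/broken do not share that prefix ("brea" vs. "brok") and only
   match when the form itself is listed in the entry.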

2. Cell padding: add 8px padding to each cell region so words at
   column/row edges don't get clipped by OCR (fixes "er wollte"
   missing at cell boundaries).

3. Treat very short DE text (≤2 chars) as OCR noise, not real
   translation — prevents false positives in example detection.

All fixes are generic and deterministic.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-28 21:24:28 +01:00
parent e3aa8e899e
commit 010616be5a

View File

@@ -2477,65 +2477,114 @@ def _split_by_comma(text: str) -> List[str]:
# --- C. Example Sentence Attachment ---
def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
"""Find the vocab entry whose English word(s) best match the example sentence.
Returns index into vocab_entries, or -1 if no match found.
Uses word stem overlap: "a broken arm" matches "broken" or "break".
"""
if not vocab_entries or not example_text:
return -1
example_lower = example_text.lower()
example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
best_idx = -1
best_score = 0
for i, entry in enumerate(vocab_entries):
en = (entry.get('english', '') or '').lower()
if not en:
continue
# Extract vocab words (split on space, comma, newline)
vocab_words = set(re.findall(r'[a-zäöüß]+', en))
# Score: how many vocab words appear in the example?
# Also check if example words share a common stem (first 4 chars)
direct_matches = vocab_words & example_words
score = len(direct_matches) * 10
# Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
if score == 0:
for vw in vocab_words:
if len(vw) < 3:
continue
stem = vw[:4] if len(vw) >= 4 else vw[:3]
for ew in example_words:
if len(ew) >= len(stem) and ew[:len(stem)] == stem:
score += 5
break
if score > best_score:
best_score = score
best_idx = i
return best_idx if best_score > 0 else -1
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Attach rows with EN text but no DE translation as examples to the preceding entry.
"""Attach rows with EN text but no DE translation as examples to matching vocab entries.
Vocabulary worksheets often have:
Row 1: break / brechen
Row 2: a broken arm (no DE → this is an example for "break")
Row 3: a broken plate (no DE → another example)
Row 4: egg / Ei (has DE → new vocab entry)
Row 1: break, broke, broken / brechen, brach, gebrochen
Row 2: a broken arm (no DE → example for "broken")
Row 3: a broken plate (no DE → example for "broken")
Row 4: egg / Ei (has DE → new vocab entry)
Rules (deterministic):
- A row is an "example row" if it has EN text but NO DE text
- It gets attached to the nearest preceding entry that HAS DE text
Rules (deterministic, generic):
- A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars)
- Find the best matching vocab entry by checking which entry's English words
appear in the example sentence (semantic matching via word overlap)
- Fall back to the nearest preceding entry if no word match found
- Multiple examples get joined with " | "
"""
if not entries:
return entries
result: List[Dict[str, Any]] = []
pending_examples: List[str] = []
# Separate into vocab entries (have DE) and example candidates (no DE)
vocab_entries: List[Dict[str, Any]] = []
examples_for: Dict[int, List[str]] = {} # vocab_index → list of example texts
for entry in entries:
en = (entry.get('english', '') or '').strip()
de = (entry.get('german', '') or '').strip()
ex = (entry.get('example', '') or '').strip()
has_de = bool(de)
# Treat very short DE (≤2 chars) as OCR noise, not real translation
has_de = len(de) > 2
has_en = bool(en)
if has_en and not has_de and result:
# This is an example sentence — attach to last vocab entry
if has_en and not has_de and vocab_entries:
# This is an example sentence — find best matching vocab entry
example_text = en
if ex:
example_text = f"{en}{ex}"
pending_examples.append(example_text)
continue
# This is a real vocab entry
# First, flush any pending examples to the previous entry
if pending_examples and result:
prev = result[-1]
existing_ex = (prev.get('example', '') or '').strip()
new_examples = ' | '.join(pending_examples)
prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
pending_examples = []
match_idx = _find_best_vocab_match(en, vocab_entries)
if match_idx < 0:
# No word match → fall back to last entry
match_idx = len(vocab_entries) - 1
result.append(entry)
if match_idx not in examples_for:
examples_for[match_idx] = []
examples_for[match_idx].append(example_text)
else:
vocab_entries.append(entry)
# Flush remaining examples
if pending_examples and result:
prev = result[-1]
existing_ex = (prev.get('example', '') or '').strip()
new_examples = ' | '.join(pending_examples)
prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
# Attach examples to their matched vocab entries
for idx, example_list in examples_for.items():
if 0 <= idx < len(vocab_entries):
entry = vocab_entries[idx]
existing_ex = (entry.get('example', '') or '').strip()
new_examples = ' | '.join(example_list)
entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
# Re-number
for i, e in enumerate(result):
for i, e in enumerate(vocab_entries):
e['row_index'] = i
return result
return vocab_entries
# --- D. Phonetic Bracket IPA Replacement ---
@@ -2794,10 +2843,12 @@ def build_word_grid(
for col in relevant_cols:
# Compute cell region: column x/width, row y/height
cell_x = col.x
cell_y = row.y
cell_w = col.width
cell_h = row.height
# Add padding to avoid clipping edge words
pad = 8 # pixels
cell_x = col.x - pad
cell_y = row.y - pad
cell_w = col.width + 2 * pad
cell_h = row.height + 2 * pad
# Clamp to image bounds
cell_x = max(0, cell_x)