fix(ocr-pipeline): generic example attachment + cell padding
1. Semantic example matching: instead of attaching example sentences to the immediately preceding entry, find the vocab entry whose English word(s) appear in the example. "a broken arm" → matches the entry containing "broken" via word overlap, not "egg/Ei". Falls back to prefix matching for word-form variants (e.g. "broke"/"broken" share the 4-character prefix "brok"); irregular forms with different prefixes, such as "break"/"broken", only match when the variant itself is listed in the entry.
2. Cell padding: add 8px padding to each cell region so words at column/row edges don't get clipped by OCR (fixes "er wollte" missing at cell boundaries).
3. Treat very short DE text (≤2 chars) as OCR noise, not a real translation — prevents false positives in example detection.
All fixes are generic and deterministic. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2477,65 +2477,114 @@ def _split_by_comma(text: str) -> List[str]:
|
||||
|
||||
# --- C. Example Sentence Attachment ---
|
||||
|
||||
def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
|
||||
"""Find the vocab entry whose English word(s) best match the example sentence.
|
||||
|
||||
Returns index into vocab_entries, or -1 if no match found.
|
||||
Uses word stem overlap: "a broken arm" matches "broken" or "break".
|
||||
"""
|
||||
if not vocab_entries or not example_text:
|
||||
return -1
|
||||
|
||||
example_lower = example_text.lower()
|
||||
example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
|
||||
|
||||
best_idx = -1
|
||||
best_score = 0
|
||||
|
||||
for i, entry in enumerate(vocab_entries):
|
||||
en = (entry.get('english', '') or '').lower()
|
||||
if not en:
|
||||
continue
|
||||
|
||||
# Extract vocab words (split on space, comma, newline)
|
||||
vocab_words = set(re.findall(r'[a-zäöüß]+', en))
|
||||
|
||||
# Score: how many vocab words appear in the example?
|
||||
# Also check if example words share a common stem (first 4 chars)
|
||||
direct_matches = vocab_words & example_words
|
||||
score = len(direct_matches) * 10
|
||||
|
||||
# Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
|
||||
if score == 0:
|
||||
for vw in vocab_words:
|
||||
if len(vw) < 3:
|
||||
continue
|
||||
stem = vw[:4] if len(vw) >= 4 else vw[:3]
|
||||
for ew in example_words:
|
||||
if len(ew) >= len(stem) and ew[:len(stem)] == stem:
|
||||
score += 5
|
||||
break
|
||||
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_idx = i
|
||||
|
||||
return best_idx if best_score > 0 else -1
|
||||
|
||||
|
||||
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Attach rows with EN text but no DE translation as examples to the preceding entry.
|
||||
"""Attach rows with EN text but no DE translation as examples to matching vocab entries.
|
||||
|
||||
Vocabulary worksheets often have:
|
||||
Row 1: break / brechen
|
||||
Row 2: a broken arm (no DE → this is an example for "break")
|
||||
Row 3: a broken plate (no DE → another example)
|
||||
Row 4: egg / Ei (has DE → new vocab entry)
|
||||
Row 1: break, broke, broken / brechen, brach, gebrochen
|
||||
Row 2: a broken arm (no DE → example for "broken")
|
||||
Row 3: a broken plate (no DE → example for "broken")
|
||||
Row 4: egg / Ei (has DE → new vocab entry)
|
||||
|
||||
Rules (deterministic):
|
||||
- A row is an "example row" if it has EN text but NO DE text
|
||||
- It gets attached to the nearest preceding entry that HAS DE text
|
||||
Rules (deterministic, generic):
|
||||
- A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars)
|
||||
- Find the best matching vocab entry by checking which entry's English words
|
||||
appear in the example sentence (semantic matching via word overlap)
|
||||
- Fall back to the nearest preceding entry if no word match found
|
||||
- Multiple examples get joined with " | "
|
||||
"""
|
||||
if not entries:
|
||||
return entries
|
||||
|
||||
result: List[Dict[str, Any]] = []
|
||||
pending_examples: List[str] = []
|
||||
# Separate into vocab entries (have DE) and example candidates (no DE)
|
||||
vocab_entries: List[Dict[str, Any]] = []
|
||||
examples_for: Dict[int, List[str]] = {} # vocab_index → list of example texts
|
||||
|
||||
for entry in entries:
|
||||
en = (entry.get('english', '') or '').strip()
|
||||
de = (entry.get('german', '') or '').strip()
|
||||
ex = (entry.get('example', '') or '').strip()
|
||||
|
||||
has_de = bool(de)
|
||||
# Treat very short DE (≤2 chars) as OCR noise, not real translation
|
||||
has_de = len(de) > 2
|
||||
has_en = bool(en)
|
||||
|
||||
if has_en and not has_de and result:
|
||||
# This is an example sentence — attach to last vocab entry
|
||||
if has_en and not has_de and vocab_entries:
|
||||
# This is an example sentence — find best matching vocab entry
|
||||
example_text = en
|
||||
if ex:
|
||||
example_text = f"{en} — {ex}"
|
||||
pending_examples.append(example_text)
|
||||
continue
|
||||
|
||||
# This is a real vocab entry
|
||||
# First, flush any pending examples to the previous entry
|
||||
if pending_examples and result:
|
||||
prev = result[-1]
|
||||
existing_ex = (prev.get('example', '') or '').strip()
|
||||
new_examples = ' | '.join(pending_examples)
|
||||
prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
|
||||
pending_examples = []
|
||||
match_idx = _find_best_vocab_match(en, vocab_entries)
|
||||
if match_idx < 0:
|
||||
# No word match → fall back to last entry
|
||||
match_idx = len(vocab_entries) - 1
|
||||
|
||||
result.append(entry)
|
||||
if match_idx not in examples_for:
|
||||
examples_for[match_idx] = []
|
||||
examples_for[match_idx].append(example_text)
|
||||
else:
|
||||
vocab_entries.append(entry)
|
||||
|
||||
# Flush remaining examples
|
||||
if pending_examples and result:
|
||||
prev = result[-1]
|
||||
existing_ex = (prev.get('example', '') or '').strip()
|
||||
new_examples = ' | '.join(pending_examples)
|
||||
prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
|
||||
# Attach examples to their matched vocab entries
|
||||
for idx, example_list in examples_for.items():
|
||||
if 0 <= idx < len(vocab_entries):
|
||||
entry = vocab_entries[idx]
|
||||
existing_ex = (entry.get('example', '') or '').strip()
|
||||
new_examples = ' | '.join(example_list)
|
||||
entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
|
||||
|
||||
# Re-number
|
||||
for i, e in enumerate(result):
|
||||
for i, e in enumerate(vocab_entries):
|
||||
e['row_index'] = i
|
||||
|
||||
return result
|
||||
return vocab_entries
|
||||
|
||||
|
||||
# --- D. Phonetic Bracket IPA Replacement ---
|
||||
@@ -2794,10 +2843,12 @@ def build_word_grid(
|
||||
|
||||
for col in relevant_cols:
|
||||
# Compute cell region: column x/width, row y/height
|
||||
cell_x = col.x
|
||||
cell_y = row.y
|
||||
cell_w = col.width
|
||||
cell_h = row.height
|
||||
# Add padding to avoid clipping edge words
|
||||
pad = 8 # pixels
|
||||
cell_x = col.x - pad
|
||||
cell_y = row.y - pad
|
||||
cell_w = col.width + 2 * pad
|
||||
cell_h = row.height + 2 * pad
|
||||
|
||||
# Clamp to image bounds
|
||||
cell_x = max(0, cell_x)
|
||||
|
||||
Reference in New Issue
Block a user