From 010616be5a711d1b08db84f5f7400e6090f8ff8e Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 28 Feb 2026 21:24:28 +0100 Subject: [PATCH] fix(ocr-pipeline): generic example attachment + cell padding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Semantic example matching: instead of attaching example sentences to the immediately preceding entry, find the vocab entry whose English word(s) appear in the example. "a broken arm" → matches "broken" via word overlap, not "egg/Ei". Uses stem matching for word form variants (break/broken share stem "bro"). 2. Cell padding: add 8px padding to each cell region so words at column/row edges don't get clipped by OCR (fixes "er wollte" missing at cell boundaries). 3. Treat very short DE text (≤2 chars) as OCR noise, not real translation — prevents false positives in example detection. All fixes are generic and deterministic. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 123 +++++++++++++------ 1 file changed, 87 insertions(+), 36 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 40d658c..0cd7cbe 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2477,65 +2477,114 @@ def _split_by_comma(text: str) -> List[str]: # --- C. Example Sentence Attachment --- +def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int: + """Find the vocab entry whose English word(s) best match the example sentence. + + Returns index into vocab_entries, or -1 if no match found. + Uses word stem overlap: "a broken arm" matches "broken" or "break". + """ + if not vocab_entries or not example_text: + return -1 + + example_lower = example_text.lower() + example_words = set(re.findall(r'[a-zäöüß]+', example_lower)) + + best_idx = -1 + best_score = 0 + + for i, entry in enumerate(vocab_entries): + en = (entry.get('english', '') or '').lower() + if not en: + continue + + # Extract vocab words (split on space, comma, newline) + vocab_words = set(re.findall(r'[a-zäöüß]+', en)) + + # Score: how many vocab words appear in the example? + # Also check if example words share a common stem (first 4 chars) + direct_matches = vocab_words & example_words + score = len(direct_matches) * 10 + + # Stem matching: "broken" matches "break" via shared prefix "bro"/"bre" + if score == 0: + for vw in vocab_words: + if len(vw) < 3: + continue + stem = vw[:4] if len(vw) >= 4 else vw[:3] + for ew in example_words: + if len(ew) >= len(stem) and ew[:len(stem)] == stem: + score += 5 + break + + if score > best_score: + best_score = score + best_idx = i + + return best_idx if best_score > 0 else -1 + + def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Attach rows with EN text but no DE translation as examples to the preceding entry. + """Attach rows with EN text but no DE translation as examples to matching vocab entries. Vocabulary worksheets often have: - Row 1: break / brechen - Row 2: a broken arm (no DE → this is an example for "break") - Row 3: a broken plate (no DE → another example) - Row 4: egg / Ei (has DE → new vocab entry) + Row 1: break, broke, broken / brechen, brach, gebrochen + Row 2: a broken arm (no DE → example for "broken") + Row 3: a broken plate (no DE → example for "broken") + Row 4: egg / Ei (has DE → new vocab entry) - Rules (deterministic): - - A row is an "example row" if it has EN text but NO DE text - - It gets attached to the nearest preceding entry that HAS DE text + Rules (deterministic, generic): + - A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars) + - Find the best matching vocab entry by checking which entry's English words + appear in the example sentence (semantic matching via word overlap) + - Fall back to the nearest preceding entry if no word match found - Multiple examples get joined with " | " """ if not entries: return entries - result: List[Dict[str, Any]] = [] - pending_examples: List[str] = [] + # Separate into vocab entries (have DE) and example candidates (no DE) + vocab_entries: List[Dict[str, Any]] = [] + examples_for: Dict[int, List[str]] = {} # vocab_index → list of example texts for entry in entries: en = (entry.get('english', '') or '').strip() de = (entry.get('german', '') or '').strip() ex = (entry.get('example', '') or '').strip() - has_de = bool(de) + # Treat very short DE (≤2 chars) as OCR noise, not real translation + has_de = len(de) > 2 has_en = bool(en) - if has_en and not has_de and result: - # This is an example sentence — attach to last vocab entry + if has_en and not has_de and vocab_entries: + # This is an example sentence — find best matching vocab entry example_text = en if ex: example_text = f"{en} — {ex}" - pending_examples.append(example_text) - continue - # This is a real vocab entry - # First, flush any pending examples to the previous entry - if pending_examples and result: - prev = result[-1] - existing_ex = (prev.get('example', '') or '').strip() - new_examples = ' | '.join(pending_examples) - prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples - pending_examples = [] + match_idx = _find_best_vocab_match(en, vocab_entries) + if match_idx < 0: + # No word match → fall back to last entry + match_idx = len(vocab_entries) - 1 - result.append(entry) + if match_idx not in examples_for: + examples_for[match_idx] = [] + examples_for[match_idx].append(example_text) + else: + vocab_entries.append(entry) - # Flush remaining examples - if pending_examples and result: - prev = result[-1] - existing_ex = (prev.get('example', '') or '').strip() - new_examples = ' | '.join(pending_examples) - prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples + # Attach examples to their matched vocab entries + for idx, example_list in examples_for.items(): + if 0 <= idx < len(vocab_entries): + entry = vocab_entries[idx] + existing_ex = (entry.get('example', '') or '').strip() + new_examples = ' | '.join(example_list) + entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples # Re-number - for i, e in enumerate(result): + for i, e in enumerate(vocab_entries): e['row_index'] = i - return result + return vocab_entries # --- D. Phonetic Bracket IPA Replacement --- @@ -2794,10 +2843,12 @@ def build_word_grid( for col in relevant_cols: # Compute cell region: column x/width, row y/height - cell_x = col.x - cell_y = row.y - cell_w = col.width - cell_h = row.height + # Add padding to avoid clipping edge words + pad = 8 # pixels + cell_x = col.x - pad + cell_y = row.y - pad + cell_w = col.width + 2 * pad + cell_h = row.height + 2 * pad # Clamp to image bounds cell_x = max(0, cell_x)