""" Vocab postprocessing: deterministic quality fixes for OCR-extracted vocabulary. - Character confusion fix (I/1/l/|) - Comma-separated word form splitting - Example sentence attachment to matching vocab entries Split from cv_ocr_engines.py for maintainability. """ import re from typing import Any, Dict, List # ============================================================================= # Post-Processing: Deterministic Quality Fixes # ============================================================================= # --- A. Character Confusion Fix (I/1/l) --- # Common OCR confusion pairs in vocabulary context _CHAR_CONFUSION_RULES = [ # "1" at word start followed by lowercase → likely "I" or "l" # Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3") (re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant # Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number) (re.compile(r'(? List[Dict[str, Any]]: """Fix common OCR character confusions using context. Deterministic rules: - "1" at word start → "I" or "l" based on context - Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1" → "I" - "y " artifact at word boundaries → remove (e.g. "y you" → "you") """ for entry in entries: en = entry.get('english', '') or '' de = entry.get('german', '') or '' ex = entry.get('example', '') or '' # Apply general rules to all fields for pattern, replacement in _CHAR_CONFUSION_RULES: en = pattern.sub(replacement, en) de = pattern.sub(replacement, de) ex = pattern.sub(replacement, ex) # Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I" de_lower_words = set(de.lower().replace(',', ' ').split()) if de_lower_words & _DE_INDICATORS_FOR_EN_I: # Any remaining "1" in EN that looks like "I" en = re.sub(r'\b1\b(?![\d.,])', 'I', en) # Fix "y " artifact before repeated word: "y you" → "you" en = re.sub(r'\by\s+([a-z])', r'\1', en) ex = re.sub(r'\by\s+([a-z])', r'\1', ex) entry['english'] = en.strip() entry['german'] = de.strip() entry['example'] = ex.strip() return entries # --- B. Comma-Separated Word Form Splitting --- def _is_singular_plural_pair(parts: List[str]) -> bool: """Detect if comma-separated parts are singular/plural forms of the same word. E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split). "break, broke, broken" → False (different verb forms, OK to split). Heuristic: exactly 2 parts that share a common prefix of >= 50% length, OR one part is a known plural suffix of the other (e.g. +s, +es, +en). """ if len(parts) != 2: return False a, b = parts[0].lower().strip(), parts[1].lower().strip() if not a or not b: return False # Common prefix heuristic: if words share >= 50% of the shorter word, # they are likely forms of the same word (Maus/Mäuse, child/children). min_len = min(len(a), len(b)) common = 0 for ca, cb in zip(a, b): if ca == cb: common += 1 else: break if common >= max(2, min_len * 0.5): return True # Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü) umlaut_map = str.maketrans('aou', 'äöü') if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a: return True return False def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Split entries with comma-separated word forms into individual entries. E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen" → 3 entries: break/brechen, broke/brach, broken/gebrochen Does NOT split singular/plural pairs like "mouse, mice" / "Maus, Mäuse" because those are forms of the same vocabulary entry. Only splits when both EN and DE have the same number of comma-parts, parts are short (word forms, not sentences), and at least 3 parts (to avoid splitting pairs that likely belong together). """ result: List[Dict[str, Any]] = [] for entry in entries: en = (entry.get('english', '') or '').strip() de = (entry.get('german', '') or '').strip() # Split by comma (but not inside brackets or parentheses) en_parts = _split_by_comma(en) de_parts = _split_by_comma(de) # Only split if we have multiple parts and counts match should_split = False if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts): # All parts must be short (word forms, not sentences) if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts): # Do NOT split singular/plural pairs (2 parts that are # forms of the same word) if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts): should_split = False else: should_split = True if not should_split: result.append(entry) continue # Split into individual entries for k in range(len(en_parts)): sub = dict(entry) # shallow copy sub['english'] = en_parts[k].strip() sub['german'] = de_parts[k].strip() if k < len(de_parts) else '' sub['example'] = '' # examples get attached later sub['split_from_comma'] = True result.append(sub) # Re-number for i, e in enumerate(result): e['row_index'] = i return result def _split_by_comma(text: str) -> List[str]: """Split text by commas, but not inside brackets [...] or parens (...).""" if ',' not in text: return [text] parts = [] depth_bracket = 0 depth_paren = 0 current = [] for ch in text: if ch == '[': depth_bracket += 1 elif ch == ']': depth_bracket = max(0, depth_bracket - 1) elif ch == '(': depth_paren += 1 elif ch == ')': depth_paren = max(0, depth_paren - 1) elif ch == ',' and depth_bracket == 0 and depth_paren == 0: parts.append(''.join(current).strip()) current = [] continue current.append(ch) if current: parts.append(''.join(current).strip()) # Filter empty parts return [p for p in parts if p] # --- C. Example Sentence Attachment --- def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int: """Find the vocab entry whose English word(s) best match the example sentence. Returns index into vocab_entries, or -1 if no match found. Uses word stem overlap: "a broken arm" matches "broken" or "break". """ if not vocab_entries or not example_text: return -1 example_lower = example_text.lower() example_words = set(re.findall(r'[a-zäöüß]+', example_lower)) best_idx = -1 best_score = 0 for i, entry in enumerate(vocab_entries): en = (entry.get('english', '') or '').lower() if not en: continue # Extract vocab words (split on space, comma, newline) vocab_words = set(re.findall(r'[a-zäöüß]+', en)) # Score: how many vocab words appear in the example? # Also check if example words share a common stem (first 4 chars) direct_matches = vocab_words & example_words score = len(direct_matches) * 10 # Stem matching: "broken" matches "break" via shared prefix "bro"/"bre" if score == 0: for vw in vocab_words: if len(vw) < 3: continue stem = vw[:4] if len(vw) >= 4 else vw[:3] for ew in example_words: if len(ew) >= len(stem) and ew[:len(stem)] == stem: score += 5 break if score > best_score: best_score = score best_idx = i return best_idx if best_score > 0 else -1 def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Attach rows with EN text but no DE translation as examples to matching vocab entries. Vocabulary worksheets often have: Row 1: break, broke, broken / brechen, brach, gebrochen Row 2: a broken arm (no DE → example for "broken") Row 3: a broken plate (no DE → example for "broken") Row 4: egg / Ei (has DE → new vocab entry) Rules (deterministic, generic): - A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars) - Find the best matching vocab entry by checking which entry's English words appear in the example sentence (semantic matching via word overlap) - Fall back to the nearest preceding entry if no word match found - Multiple examples get joined with " | " """ if not entries: return entries # Separate into vocab entries (have DE) and example candidates (no DE) vocab_entries: List[Dict[str, Any]] = [] examples_for: Dict[int, List[str]] = {} # vocab_index → list of example texts for entry in entries: en = (entry.get('english', '') or '').strip() de = (entry.get('german', '') or '').strip() ex = (entry.get('example', '') or '').strip() # Treat single-char DE as OCR noise, not real translation. # "Ei" (2 chars) is a valid German word, so threshold is 1. has_de = len(de) > 1 has_en = bool(en) # Heuristic: a row without DE is an "example sentence" only if # the EN text looks like a sentence (>= 4 words, or contains # typical sentence punctuation). Short EN text (1-3 words) is # more likely a vocab entry whose DE was missed by OCR. _looks_like_sentence = ( len(en.split()) >= 4 or en.rstrip().endswith(('.', '!', '?')) ) is_example_candidate = ( has_en and not has_de and _looks_like_sentence and vocab_entries ) if is_example_candidate: # This is an example sentence — find best matching vocab entry example_text = en match_idx = _find_best_vocab_match(en, vocab_entries) if match_idx < 0: # No word match → fall back to last entry match_idx = len(vocab_entries) - 1 if match_idx not in examples_for: examples_for[match_idx] = [] examples_for[match_idx].append(example_text) else: vocab_entries.append(entry) # Attach examples to their matched vocab entries for idx, example_list in examples_for.items(): if 0 <= idx < len(vocab_entries): entry = vocab_entries[idx] existing_ex = (entry.get('example', '') or '').strip() new_examples = ' | '.join(example_list) entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples # Re-number for i, e in enumerate(vocab_entries): e['row_index'] = i return vocab_entries