From ab294d5a6fc1c5fedee0cf69e4137510be60fd9e Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 28 Feb 2026 21:00:09 +0100 Subject: [PATCH] feat(ocr-pipeline): deterministic post-processing pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 4 post-processing steps after OCR (no LLM needed): 1. Character confusion fix: I/1/l/| correction using cross-language context (if DE has "Ich", EN "1" → "I") 2. IPA dictionary replacement: detect [phonetics] brackets, look up correct IPA from eng_to_ipa (MIT, 134k words) — replaces OCR'd phonetic symbols with dictionary-correct transcription 3. Comma-split: "break, broke, broken" / "brechen, brach, gebrochen" → 3 individual entries when part counts match 4. Example sentence attachment: rows with EN but no DE translation get attached as examples to the preceding vocab entry All fixes are deterministic and generic — no hardcoded word lists. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 293 ++++++++++++++++++- 1 file changed, 290 insertions(+), 3 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index cc2c6e8..40d658c 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -47,6 +47,20 @@ except ImportError: CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE +# --- IPA Dictionary --- + +IPA_AVAILABLE = False +_ipa_convert = None + +try: + import eng_to_ipa as _eng_to_ipa + _ipa_convert = _eng_to_ipa.convert + IPA_AVAILABLE = True + logger.info("eng_to_ipa available — IPA dictionary lookup enabled") +except ImportError: + logger.info("eng_to_ipa not installed — IPA replacement disabled") + +import re # --- Language Detection Constants --- @@ -2324,6 +2338,263 @@ def ocr_region_rapid( return words +# ============================================================================= +# Post-Processing: 
# --- B. Comma-Separated Word Form Splitting ---

def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Split rows whose EN and DE cells list the same number of word forms.

    Example:
        EN "break, broke, broken" / DE "brechen, brach, gebrochen"
        -> break/brechen, broke/brach, broken/gebrochen

    A row is split only when all of the following hold:
      - both cells contain more than one comma-separated part,
      - both cells contain the same number of parts,
      - every part is at most three whitespace-separated words, so
        sentences that merely contain commas are left alone.
    Every other row is passed through as the same dict object.

    Each split row is a shallow copy of the source row with ``english`` and
    ``german`` replaced by one form each and ``split_from_comma`` set to
    True. The source row's ``example`` text is kept on the first form (it
    used to be dropped); the remaining forms get an empty ``example``.

    Args:
        entries: Row dicts. The keys ``english``, ``german`` and ``example``
            are read; missing or None values count as empty.

    Returns:
        A new list. ``row_index`` of every returned dict, including
        passed-through input dicts, is overwritten with its position in the
        returned list.
    """
    result: List[Dict[str, Any]] = []

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()

        # Split by comma (but not inside brackets or parentheses)
        en_parts = _split_by_comma(en)
        de_parts = _split_by_comma(de)

        # Equal counts > 1 and short parts: each part is one word form.
        is_form_list = (
            len(en_parts) > 1
            and len(en_parts) == len(de_parts)
            and all(len(p.split()) <= 3 for p in en_parts)
            and all(len(p.split()) <= 3 for p in de_parts)
        )
        if not is_form_list:
            result.append(entry)
            continue

        row_example = (entry.get('example', '') or '').strip()
        for idx, (en_form, de_form) in enumerate(zip(en_parts, de_parts)):
            sub = dict(entry)  # shallow copy: nested values stay shared
            sub['english'] = en_form
            sub['german'] = de_form
            # Keep the row's own example on the first form instead of
            # discarding it; later forms start without an example.
            sub['example'] = row_example if idx == 0 else ''
            sub['split_from_comma'] = True
            result.append(sub)

    # Re-number
    for i, e in enumerate(result):
        e['row_index'] = i

    return result


def _split_by_comma(text: str) -> List[str]:
    """Split *text* at top-level commas.

    Commas inside ``[...]`` or ``(...)`` do not split. Parts are stripped
    and empty parts are dropped. Text without any comma is returned
    unchanged as a one-element list (not stripped, possibly empty).

    Bracket depth never drops below zero, so a stray closing bracket is
    ignored; an opening bracket that is never closed suppresses splitting
    for the remainder of the text.
    """
    if ',' not in text:
        return [text]

    parts: List[str] = []
    current: List[str] = []
    depth_bracket = 0
    depth_paren = 0

    for ch in text:
        if ch == '[':
            depth_bracket += 1
        elif ch == ']':
            depth_bracket = max(0, depth_bracket - 1)
        elif ch == '(':
            depth_paren += 1
        elif ch == ')':
            depth_paren = max(0, depth_paren - 1)
        elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
            # Top-level comma: close the current part, drop the comma.
            parts.append(''.join(current).strip())
            current = []
            continue
        current.append(ch)

    if current:
        parts.append(''.join(current).strip())

    # Filter empty parts
    return [p for p in parts if p]
# --- C. Example Sentence Attachment ---

def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fold rows that have EN text but no DE text into the preceding vocab row.

    Vocabulary worksheets often have:
      Row 1: break / brechen
      Row 2: a broken arm       (no DE -> example for "break")
      Row 3: a broken plate     (no DE -> another example)
      Row 4: egg / Ei           (has DE -> new vocab entry)

    Rules (deterministic):
      - A row is an "example row" if it has EN text but no DE text.
      - It is attached to the nearest preceding kept row that has DE text.
        If no such row exists yet, the row is kept as an ordinary entry
        (previously it was merged into whatever row came last, even one
        without a translation).
      - The attached text is the row's EN text, or "EN — example" when the
        row also carries example text.
      - Multiple examples are joined with " | " after any example text the
        target row already had.

    Args:
        entries: Row dicts. The keys ``english``, ``german`` and ``example``
            are read; missing or None values count as empty.

    Returns:
        The kept rows in their original order. Kept dicts are the input
        objects: their ``example`` may be extended in place and their
        ``row_index`` is overwritten with the new position. An empty input
        is returned as is.
    """
    if not entries:
        return entries

    result: List[Dict[str, Any]] = []
    # Most recent kept row that has a German translation.
    anchor = None

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()
        ex = (entry.get('example', '') or '').strip()

        if en and not de and anchor is not None:
            # Example row: append to the anchor right away. The anchor
            # cannot change while example rows follow each other, so no
            # pending buffer is needed.
            example_text = f"{en} — {ex}" if ex else en
            existing_ex = (anchor.get('example', '') or '').strip()
            anchor['example'] = (
                f"{existing_ex} | {example_text}" if existing_ex else example_text
            )
            continue

        result.append(entry)
        if de:
            anchor = entry

    # Re-number
    for i, e in enumerate(result):
        e['row_index'] = i

    return result
# --- D. Phonetic Bracket IPA Replacement ---

# Pattern: word [phonetic]. Group 1 is the word directly before the square
# bracket, group 2 the bracket content. Round brackets are not matched.
# The lookbehind rejects a word that is only the tail of a hyphenated or
# apostrophised token ("ice-cream [...]", "o'clock [...]"): looking up just
# "cream" / "clock" would write the IPA of that fragment into the bracket.
_PHONETIC_BRACKET_RE = re.compile(
    r"(?<![-'’])(\b[a-zA-ZäöüÄÖÜß]+)\s*\[([^\]]*)\]"
)


def _fix_phonetic_brackets(entries: List[Dict[str, Any]], convert=None) -> List[Dict[str, Any]]:
    """Replace OCR'd phonetic transcriptions with dictionary IPA.

    Turns patterns like "dance [du:ns]" into "dance [dæns]" in the
    ``english``, ``german`` and ``example`` fields of every entry. Fields
    without a "[" are not touched.

    Args:
        entries: Row dicts; modified in place.
        convert: Optional callable ``word -> IPA string``. When omitted, the
            module-level ``_ipa_convert`` is used; if that dictionary is not
            available the entries are returned unchanged.

    Returns:
        The same ``entries`` list.
    """
    if convert is None:
        if not IPA_AVAILABLE or _ipa_convert is None:
            return entries
        convert = _ipa_convert

    for entry in entries:
        for field in ('english', 'german', 'example'):
            text = entry.get(field, '') or ''
            if '[' not in text:
                continue
            entry[field] = _replace_phonetics_in_text(text, convert)

    return entries


def _replace_phonetics_in_text(text: str, convert=None) -> str:
    """Replace the ``[...]`` after a word with that word's dictionary IPA.

    A match is left exactly as found when:
      - the bracket content has more than three whitespace-separated tokens
        (treated as ordinary text rather than a transcription), or
      - the lookup result is empty/None or contains "*". The original code
        comment states that eng_to_ipa marks unknown words with "*".
    Otherwise the match becomes ``"<word> [<ipa>]"`` with a single space;
    the word keeps its original casing, the lookup uses the lower-cased word.

    Args:
        text: Text to process.
        convert: Optional callable ``word -> IPA string``. When omitted, the
            module-level ``_ipa_convert`` is used; if that dictionary is not
            available, *text* is returned unchanged.

    Returns:
        The text with replaceable brackets rewritten.
    """
    if convert is None:
        if not IPA_AVAILABLE or _ipa_convert is None:
            return text
        convert = _ipa_convert

    def replacer(match):
        word = match.group(1)
        ocr_phonetic = match.group(2)

        # Long bracket content is regular text, not phonetics.
        if len(ocr_phonetic.split()) > 3:
            return match.group(0)  # Keep original

        # Look up in IPA dictionary
        ipa = convert(word.lower())

        # Emptiness is tested first so a None result cannot reach the
        # "in" test and raise TypeError.
        if not ipa or '*' in ipa:
            return match.group(0)  # Keep original

        # The converter returns bare IPA, we add the brackets.
        return f"{word} [{ipa}]"

    return _PHONETIC_BRACKET_RE.sub(replacer, text)
Attach example sentences (rows without DE → examples for preceding entry) + entries = _attach_example_sentences(entries) + logger.info(f"build_word_grid: {len(entries)} entries from " - f"{len(content_rows)} content rows × {len(relevant_cols)} columns " - f"(engine={engine_name})") + f"{n_raw} raw → {len(entries)} after post-processing " + f"({len(content_rows)} content rows × {len(relevant_cols)} columns, " + f"engine={engine_name})") return entries