feat(ocr-pipeline): deterministic post-processing pipeline
Add 4 post-processing steps after OCR (no LLM needed): 1. Character confusion fix: I/1/l/| correction using cross-language context (if DE has "Ich", EN "1" → "I") 2. IPA dictionary replacement: detect [phonetics] brackets, look up correct IPA from eng_to_ipa (MIT, 134k words) — replaces OCR'd phonetic symbols with dictionary-correct transcription 3. Comma-split: "break, broke, broken" / "brechen, brach, gebrochen" → 3 individual entries when part counts match 4. Example sentence attachment: rows with EN but no DE translation get attached as examples to the preceding vocab entry All fixes are deterministic and generic — no hardcoded word lists. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -47,6 +47,20 @@ except ImportError:
|
|||||||
|
|
||||||
# Full CV pipeline requires both OpenCV and Tesseract to be importable.
CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE

# --- IPA Dictionary ---
# Optional eng_to_ipa dependency: provides dictionary-correct IPA
# transcriptions used later to repair OCR'd phonetic brackets.
# When the package is missing, IPA replacement is silently disabled.
IPA_AVAILABLE = False
_ipa_convert = None  # bound to eng_to_ipa.convert when available
try:
    import eng_to_ipa as _eng_to_ipa
    _ipa_convert = _eng_to_ipa.convert
    IPA_AVAILABLE = True
    logger.info("eng_to_ipa available — IPA dictionary lookup enabled")
except ImportError:
    logger.info("eng_to_ipa not installed — IPA replacement disabled")

import re  # NOTE(review): mid-file import — consider moving to the top-of-file import block
|
||||||
|
|
||||||
# --- Language Detection Constants ---
|
# --- Language Detection Constants ---
|
||||||
|
|
||||||
@@ -2324,6 +2338,263 @@ def ocr_region_rapid(
|
|||||||
return words
|
return words
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Post-Processing: Deterministic Quality Fixes
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# --- A. Character Confusion Fix (I/1/l) ---
|
||||||
|
|
||||||
|
# Common OCR confusion pairs in vocabulary context
|
||||||
|
_CHAR_CONFUSION_RULES = [
|
||||||
|
# "1" at word start followed by lowercase → likely "I" or "l"
|
||||||
|
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1 want → I want
|
||||||
|
# Standalone "1" between words → "I" (English pronoun)
|
||||||
|
(re.compile(r'(?<!\d)\b1\b(?!\d)'), 'I'), # "1 want" → "I want"
|
||||||
|
# "|" used as "I" or "l"
|
||||||
|
(re.compile(r'(?<!\|)\|(?!\|)'), 'I'), # |ch → Ich
|
||||||
|
]
|
||||||
|
|
||||||
|
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
|
||||||
|
_DE_INDICATORS_FOR_EN_I = {'ich', 'mich', 'mir', 'mein', 'meine', 'meiner', 'meinem'}
|
||||||
|
|
||||||
|
|
||||||
|
def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||||
|
"""Fix common OCR character confusions using context.
|
||||||
|
|
||||||
|
Deterministic rules:
|
||||||
|
- "1" at word start → "I" or "l" based on context
|
||||||
|
- Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1" → "I"
|
||||||
|
- "y " artifact at word boundaries → remove (e.g. "y you" → "you")
|
||||||
|
"""
|
||||||
|
for entry in entries:
|
||||||
|
en = entry.get('english', '') or ''
|
||||||
|
de = entry.get('german', '') or ''
|
||||||
|
ex = entry.get('example', '') or ''
|
||||||
|
|
||||||
|
# Apply general rules to all fields
|
||||||
|
for pattern, replacement in _CHAR_CONFUSION_RULES:
|
||||||
|
en = pattern.sub(replacement, en)
|
||||||
|
de = pattern.sub(replacement, de)
|
||||||
|
ex = pattern.sub(replacement, ex)
|
||||||
|
|
||||||
|
# Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I"
|
||||||
|
de_lower_words = set(de.lower().replace(',', ' ').split())
|
||||||
|
if de_lower_words & _DE_INDICATORS_FOR_EN_I:
|
||||||
|
# Any remaining "1" in EN that looks like "I"
|
||||||
|
en = re.sub(r'\b1\b', 'I', en)
|
||||||
|
|
||||||
|
# Fix "y " artifact before repeated word: "y you" → "you"
|
||||||
|
en = re.sub(r'\by\s+([a-z])', r'\1', en)
|
||||||
|
ex = re.sub(r'\by\s+([a-z])', r'\1', ex)
|
||||||
|
|
||||||
|
entry['english'] = en.strip()
|
||||||
|
entry['german'] = de.strip()
|
||||||
|
entry['example'] = ex.strip()
|
||||||
|
|
||||||
|
return entries
|
||||||
|
|
||||||
|
|
||||||
|
# --- B. Comma-Separated Word Form Splitting ---
|
||||||
|
|
||||||
|
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||||
|
"""Split entries with comma-separated word forms into individual entries.
|
||||||
|
|
||||||
|
E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen"
|
||||||
|
→ 3 entries: break/brechen, broke/brach, broken/gebrochen
|
||||||
|
|
||||||
|
Only splits when both EN and DE have the same number of comma-parts,
|
||||||
|
or when one side has multiple and the other has exactly one.
|
||||||
|
"""
|
||||||
|
result: List[Dict[str, Any]] = []
|
||||||
|
|
||||||
|
for entry in entries:
|
||||||
|
en = (entry.get('english', '') or '').strip()
|
||||||
|
de = (entry.get('german', '') or '').strip()
|
||||||
|
|
||||||
|
# Split by comma (but not inside brackets or parentheses)
|
||||||
|
en_parts = _split_by_comma(en)
|
||||||
|
de_parts = _split_by_comma(de)
|
||||||
|
|
||||||
|
# Only split if we have multiple parts and counts match or one side is single
|
||||||
|
should_split = False
|
||||||
|
if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts):
|
||||||
|
# Both have same count — each part is a word form
|
||||||
|
# But only if parts are short (word forms, not sentences)
|
||||||
|
if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts):
|
||||||
|
should_split = True
|
||||||
|
|
||||||
|
if not should_split:
|
||||||
|
result.append(entry)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Split into individual entries
|
||||||
|
for k in range(len(en_parts)):
|
||||||
|
sub = dict(entry) # shallow copy
|
||||||
|
sub['english'] = en_parts[k].strip()
|
||||||
|
sub['german'] = de_parts[k].strip() if k < len(de_parts) else ''
|
||||||
|
sub['example'] = '' # examples get attached later
|
||||||
|
sub['split_from_comma'] = True
|
||||||
|
result.append(sub)
|
||||||
|
|
||||||
|
# Re-number
|
||||||
|
for i, e in enumerate(result):
|
||||||
|
e['row_index'] = i
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _split_by_comma(text: str) -> List[str]:
|
||||||
|
"""Split text by commas, but not inside brackets [...] or parens (...)."""
|
||||||
|
if ',' not in text:
|
||||||
|
return [text]
|
||||||
|
|
||||||
|
parts = []
|
||||||
|
depth_bracket = 0
|
||||||
|
depth_paren = 0
|
||||||
|
current = []
|
||||||
|
|
||||||
|
for ch in text:
|
||||||
|
if ch == '[':
|
||||||
|
depth_bracket += 1
|
||||||
|
elif ch == ']':
|
||||||
|
depth_bracket = max(0, depth_bracket - 1)
|
||||||
|
elif ch == '(':
|
||||||
|
depth_paren += 1
|
||||||
|
elif ch == ')':
|
||||||
|
depth_paren = max(0, depth_paren - 1)
|
||||||
|
elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
|
||||||
|
parts.append(''.join(current).strip())
|
||||||
|
current = []
|
||||||
|
continue
|
||||||
|
current.append(ch)
|
||||||
|
|
||||||
|
if current:
|
||||||
|
parts.append(''.join(current).strip())
|
||||||
|
|
||||||
|
# Filter empty parts
|
||||||
|
return [p for p in parts if p]
|
||||||
|
|
||||||
|
|
||||||
|
# --- C. Example Sentence Attachment ---
|
||||||
|
|
||||||
|
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||||
|
"""Attach rows with EN text but no DE translation as examples to the preceding entry.
|
||||||
|
|
||||||
|
Vocabulary worksheets often have:
|
||||||
|
Row 1: break / brechen
|
||||||
|
Row 2: a broken arm (no DE → this is an example for "break")
|
||||||
|
Row 3: a broken plate (no DE → another example)
|
||||||
|
Row 4: egg / Ei (has DE → new vocab entry)
|
||||||
|
|
||||||
|
Rules (deterministic):
|
||||||
|
- A row is an "example row" if it has EN text but NO DE text
|
||||||
|
- It gets attached to the nearest preceding entry that HAS DE text
|
||||||
|
- Multiple examples get joined with " | "
|
||||||
|
"""
|
||||||
|
if not entries:
|
||||||
|
return entries
|
||||||
|
|
||||||
|
result: List[Dict[str, Any]] = []
|
||||||
|
pending_examples: List[str] = []
|
||||||
|
|
||||||
|
for entry in entries:
|
||||||
|
en = (entry.get('english', '') or '').strip()
|
||||||
|
de = (entry.get('german', '') or '').strip()
|
||||||
|
ex = (entry.get('example', '') or '').strip()
|
||||||
|
|
||||||
|
has_de = bool(de)
|
||||||
|
has_en = bool(en)
|
||||||
|
|
||||||
|
if has_en and not has_de and result:
|
||||||
|
# This is an example sentence — attach to last vocab entry
|
||||||
|
example_text = en
|
||||||
|
if ex:
|
||||||
|
example_text = f"{en} — {ex}"
|
||||||
|
pending_examples.append(example_text)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# This is a real vocab entry
|
||||||
|
# First, flush any pending examples to the previous entry
|
||||||
|
if pending_examples and result:
|
||||||
|
prev = result[-1]
|
||||||
|
existing_ex = (prev.get('example', '') or '').strip()
|
||||||
|
new_examples = ' | '.join(pending_examples)
|
||||||
|
prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
|
||||||
|
pending_examples = []
|
||||||
|
|
||||||
|
result.append(entry)
|
||||||
|
|
||||||
|
# Flush remaining examples
|
||||||
|
if pending_examples and result:
|
||||||
|
prev = result[-1]
|
||||||
|
existing_ex = (prev.get('example', '') or '').strip()
|
||||||
|
new_examples = ' | '.join(pending_examples)
|
||||||
|
prev['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
|
||||||
|
|
||||||
|
# Re-number
|
||||||
|
for i, e in enumerate(result):
|
||||||
|
e['row_index'] = i
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
# --- D. Phonetic Bracket IPA Replacement ---

# Matches "word [phonetic]": a word (including German umlauts/ß) followed by a
# bracketed transcription.  group(1) = word, group(2) = bracket content.
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*\[([^\]]*)\]'
)


def _fix_phonetic_brackets(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Replace OCR'd phonetic transcriptions with dictionary IPA.

    Detects patterns like "dance [du:ns]" and replaces with "dance [dæns]"
    using eng_to_ipa dictionary lookup.

    Only replaces when:
    - The word before the brackets is found in the IPA dictionary
    - The bracket content looks like phonetics (at most 3 whitespace-separated
      tokens — longer content is treated as regular text and left alone)

    No-op when eng_to_ipa is not installed.
    """
    if not IPA_AVAILABLE or _ipa_convert is None:
        return entries

    for entry in entries:
        for key in ('english', 'german', 'example'):
            value = entry.get(key, '') or ''
            if '[' in value:
                entry[key] = _replace_phonetics_in_text(value)

    return entries


def _replace_phonetics_in_text(text: str) -> str:
    """Replace [phonetic] after words with dictionary IPA lookups."""
    if not IPA_AVAILABLE or _ipa_convert is None:
        return text

    def _substitute(match):
        word = match.group(1)
        bracket_content = match.group(2)

        # More than 3 tokens inside the brackets → probably regular text,
        # not a phonetic transcription; keep the original span untouched.
        if len(bracket_content.split()) > 3:
            return match.group(0)

        # Dictionary lookup; eng_to_ipa marks unknown words with "*".
        ipa = _ipa_convert(word.lower())
        if not ipa or '*' in ipa:
            return match.group(0)

        # eng_to_ipa returns bare IPA — re-wrap it in brackets.
        return f"{word} [{ipa}]"

    return _PHONETIC_BRACKET_RE.sub(_substitute, text)
|
||||||
|
|
||||||
|
|
||||||
def _split_oversized_entries(
|
def _split_oversized_entries(
|
||||||
entries: List[Dict[str, Any]],
|
entries: List[Dict[str, Any]],
|
||||||
content_rows: List[RowGeometry],
|
content_rows: List[RowGeometry],
|
||||||
@@ -2591,12 +2862,28 @@ def build_word_grid(
|
|||||||
if entry['english'] or entry['german'] or entry['example']:
|
if entry['english'] or entry['german'] or entry['example']:
|
||||||
entries.append(entry)
|
entries.append(entry)
|
||||||
|
|
||||||
# --- Post-processing: split oversized rows ---
|
# --- Post-processing pipeline (deterministic, no LLM) ---
|
||||||
|
n_raw = len(entries)
|
||||||
|
|
||||||
|
# 1. Split oversized rows (missed Step 4 boundaries)
|
||||||
entries = _split_oversized_entries(entries, content_rows, img_w, img_h)
|
entries = _split_oversized_entries(entries, content_rows, img_w, img_h)
|
||||||
|
|
||||||
|
# 2. Fix character confusion (I/1/l based on context)
|
||||||
|
entries = _fix_character_confusion(entries)
|
||||||
|
|
||||||
|
# 3. Replace OCR'd phonetics with dictionary IPA
|
||||||
|
entries = _fix_phonetic_brackets(entries)
|
||||||
|
|
||||||
|
# 4. Split comma-separated word forms (break, broke, broken → 3 entries)
|
||||||
|
entries = _split_comma_entries(entries)
|
||||||
|
|
||||||
|
# 5. Attach example sentences (rows without DE → examples for preceding entry)
|
||||||
|
entries = _attach_example_sentences(entries)
|
||||||
|
|
||||||
logger.info(f"build_word_grid: {len(entries)} entries from "
|
logger.info(f"build_word_grid: {len(entries)} entries from "
|
||||||
f"{len(content_rows)} content rows × {len(relevant_cols)} columns "
|
f"{n_raw} raw → {len(entries)} after post-processing "
|
||||||
f"(engine={engine_name})")
|
f"({len(content_rows)} content rows × {len(relevant_cols)} columns, "
|
||||||
|
f"engine={engine_name})")
|
||||||
|
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user