refactor(ocr-pipeline): make post-processing fully generic
Three non-generic solutions replaced with universal heuristics: 1. Cell-OCR fallback: instead of restricting to column_en/column_de, now checks pixel density (>2% dark pixels) for ANY column type. Truly empty cells are skipped without running Tesseract. 2. Example-sentence detection: instead of checking for example-column text (worksheet-specific), now uses sentence heuristics (>=4 words or ends with sentence punctuation). Short EN text without DE is kept as a vocab entry (OCR may have missed the translation). 3. Comma-split: re-enabled with singular/plural detection. Pairs like "mouse, mice" / "Maus, Mäuse" are kept together. Verb forms like "break, broke, broken" are still split into individual entries. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2713,14 +2713,54 @@ def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, An
|
|||||||
|
|
||||||
# --- B. Comma-Separated Word Form Splitting ---
|
# --- B. Comma-Separated Word Form Splitting ---
|
||||||
|
|
||||||
|
def _is_singular_plural_pair(parts: List[str]) -> bool:
|
||||||
|
"""Detect if comma-separated parts are singular/plural forms of the same word.
|
||||||
|
|
||||||
|
E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split).
|
||||||
|
"break, broke, broken" → False (different verb forms, OK to split).
|
||||||
|
|
||||||
|
Heuristic: exactly 2 parts that share a common prefix of >= 50% length,
|
||||||
|
OR one part is a known plural suffix of the other (e.g. +s, +es, +en).
|
||||||
|
"""
|
||||||
|
if len(parts) != 2:
|
||||||
|
return False
|
||||||
|
|
||||||
|
a, b = parts[0].lower().strip(), parts[1].lower().strip()
|
||||||
|
if not a or not b:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Common prefix heuristic: if words share >= 50% of the shorter word,
|
||||||
|
# they are likely forms of the same word (Maus/Mäuse, child/children).
|
||||||
|
min_len = min(len(a), len(b))
|
||||||
|
common = 0
|
||||||
|
for ca, cb in zip(a, b):
|
||||||
|
if ca == cb:
|
||||||
|
common += 1
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
if common >= max(2, min_len * 0.5):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü)
|
||||||
|
umlaut_map = str.maketrans('aou', 'äöü')
|
||||||
|
if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a:
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||||
"""Split entries with comma-separated word forms into individual entries.
|
"""Split entries with comma-separated word forms into individual entries.
|
||||||
|
|
||||||
E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen"
|
E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen"
|
||||||
→ 3 entries: break/brechen, broke/brach, broken/gebrochen
|
→ 3 entries: break/brechen, broke/brach, broken/gebrochen
|
||||||
|
|
||||||
|
Does NOT split singular/plural pairs like "mouse, mice" / "Maus, Mäuse"
|
||||||
|
because those are forms of the same vocabulary entry.
|
||||||
|
|
||||||
Only splits when both EN and DE have the same number of comma-parts,
|
Only splits when both EN and DE have the same number of comma-parts,
|
||||||
or when one side has multiple and the other has exactly one.
|
parts are short (word forms, not sentences), and at least 3 parts
|
||||||
|
(to avoid splitting pairs that likely belong together).
|
||||||
"""
|
"""
|
||||||
result: List[Dict[str, Any]] = []
|
result: List[Dict[str, Any]] = []
|
||||||
|
|
||||||
@@ -2732,13 +2772,17 @@ def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|||||||
en_parts = _split_by_comma(en)
|
en_parts = _split_by_comma(en)
|
||||||
de_parts = _split_by_comma(de)
|
de_parts = _split_by_comma(de)
|
||||||
|
|
||||||
# Only split if we have multiple parts and counts match or one side is single
|
# Only split if we have multiple parts and counts match
|
||||||
should_split = False
|
should_split = False
|
||||||
if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts):
|
if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts):
|
||||||
# Both have same count — each part is a word form
|
# All parts must be short (word forms, not sentences)
|
||||||
# But only if parts are short (word forms, not sentences)
|
|
||||||
if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts):
|
if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts):
|
||||||
should_split = True
|
# Do NOT split singular/plural pairs (2 parts that are
|
||||||
|
# forms of the same word)
|
||||||
|
if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts):
|
||||||
|
should_split = False
|
||||||
|
else:
|
||||||
|
should_split = True
|
||||||
|
|
||||||
if not should_split:
|
if not should_split:
|
||||||
result.append(entry)
|
result.append(entry)
|
||||||
@@ -2872,13 +2916,18 @@ def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, A
|
|||||||
# "Ei" (2 chars) is a valid German word, so threshold is 1.
|
# "Ei" (2 chars) is a valid German word, so threshold is 1.
|
||||||
has_de = len(de) > 1
|
has_de = len(de) > 1
|
||||||
has_en = bool(en)
|
has_en = bool(en)
|
||||||
has_ex = bool(ex)
|
|
||||||
|
|
||||||
# A row is an example candidate ONLY if it has EN text but
|
# Heuristic: a row without DE is an "example sentence" only if
|
||||||
# NO DE translation AND NO example-column text. Rows with
|
# the EN text looks like a sentence (>= 4 words, or contains
|
||||||
# text in the example column are real vocab entries (e.g.
|
# typical sentence punctuation). Short EN text (1-3 words) is
|
||||||
# continuation lines like "stand ..." / "German: Ich möchte...").
|
# more likely a vocab entry whose DE was missed by OCR.
|
||||||
is_example_candidate = has_en and not has_de and not has_ex and vocab_entries
|
_looks_like_sentence = (
|
||||||
|
len(en.split()) >= 4
|
||||||
|
or en.rstrip().endswith(('.', '!', '?'))
|
||||||
|
)
|
||||||
|
is_example_candidate = (
|
||||||
|
has_en and not has_de and _looks_like_sentence and vocab_entries
|
||||||
|
)
|
||||||
|
|
||||||
if is_example_candidate:
|
if is_example_candidate:
|
||||||
# This is an example sentence — find best matching vocab entry
|
# This is an example sentence — find best matching vocab entry
|
||||||
@@ -3127,12 +3176,20 @@ def _ocr_single_cell(
|
|||||||
# --- FALLBACK: Cell-OCR for empty cells ---
|
# --- FALLBACK: Cell-OCR for empty cells ---
|
||||||
# Full-page Tesseract can miss small or isolated words (e.g. "Ei").
|
# Full-page Tesseract can miss small or isolated words (e.g. "Ei").
|
||||||
# Re-run OCR on the cell crop to catch what word-lookup missed.
|
# Re-run OCR on the cell crop to catch what word-lookup missed.
|
||||||
# Only run fallback for EN/DE columns (where vocab words are expected).
|
# To avoid wasting time on truly empty cells, check pixel density first:
|
||||||
# Example columns are often legitimately empty and running Tesseract on
|
# only run Tesseract if the cell crop contains enough dark pixels to
|
||||||
# all of them wastes ~10s. column_example cells stay empty if word-lookup
|
# plausibly contain text.
|
||||||
# found nothing.
|
_run_fallback = False
|
||||||
_fallback_col_types = {'column_en', 'column_de'}
|
if not text.strip() and cell_w > 0 and cell_h > 0:
|
||||||
if not text.strip() and cell_w > 0 and cell_h > 0 and col.type in _fallback_col_types:
|
# Quick pixel-density check: binarise the cell crop and count
|
||||||
|
# dark pixels. Text cells typically have >2% ink coverage.
|
||||||
|
if ocr_img is not None:
|
||||||
|
crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
|
||||||
|
if crop.size > 0:
|
||||||
|
# Threshold: pixels darker than 180 (on 0-255 grayscale)
|
||||||
|
dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
|
||||||
|
_run_fallback = dark_ratio > 0.02
|
||||||
|
if _run_fallback:
|
||||||
cell_region = PageRegion(
|
cell_region = PageRegion(
|
||||||
type=col.type,
|
type=col.type,
|
||||||
x=cell_x, y=cell_y,
|
x=cell_x, y=cell_y,
|
||||||
|
|||||||
@@ -1179,9 +1179,7 @@ async def detect_words(
|
|||||||
entries = _cells_to_vocab_entries(cells, columns_meta)
|
entries = _cells_to_vocab_entries(cells, columns_meta)
|
||||||
entries = _fix_character_confusion(entries)
|
entries = _fix_character_confusion(entries)
|
||||||
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
|
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
|
||||||
# NOTE: _split_comma_entries disabled — word forms like "mouse, mice"
|
entries = _split_comma_entries(entries)
|
||||||
# / "Maus, Mäuse" belong together in one entry.
|
|
||||||
# entries = _split_comma_entries(entries)
|
|
||||||
entries = _attach_example_sentences(entries)
|
entries = _attach_example_sentences(entries)
|
||||||
word_result["vocab_entries"] = entries
|
word_result["vocab_entries"] = entries
|
||||||
# Also keep "entries" key for backwards compatibility
|
# Also keep "entries" key for backwards compatibility
|
||||||
@@ -1310,9 +1308,7 @@ async def _word_stream_generator(
|
|||||||
entries = _cells_to_vocab_entries(all_cells, columns_meta)
|
entries = _cells_to_vocab_entries(all_cells, columns_meta)
|
||||||
entries = _fix_character_confusion(entries)
|
entries = _fix_character_confusion(entries)
|
||||||
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
|
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
|
||||||
# NOTE: _split_comma_entries disabled — word forms like "mouse, mice"
|
entries = _split_comma_entries(entries)
|
||||||
# / "Maus, Mäuse" belong together in one entry.
|
|
||||||
# entries = _split_comma_entries(entries)
|
|
||||||
entries = _attach_example_sentences(entries)
|
entries = _attach_example_sentences(entries)
|
||||||
word_result["vocab_entries"] = entries
|
word_result["vocab_entries"] = entries
|
||||||
word_result["entries"] = entries
|
word_result["entries"] = entries
|
||||||
|
|||||||
Reference in New Issue
Block a user