# Pattern for a line made up solely of an (optionally quoted) phonetic run.
# NOTE(review): nothing in the visible code references this pattern, and
# because ``[^\]]*`` can consume the whole string it matches any text that
# contains no ``]`` — confirm it is still needed before relying on it.
_PHONETIC_ONLY_RE = re.compile(
    r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
)


def _is_phonetic_only_text(text: str) -> bool:
    """Return True when *text* holds nothing but phonetic transcription.

    Examples:
        ['mani serva]      → True
        [dɑːns]            → True
        ["a:mand]          → True
        almond ['a:mand]   → False (a real word precedes the bracket)
        Mandel             → False (no bracket at all)

    Args:
        text: Cell text to classify. Falsy input (``""`` or ``None``) is
            treated as not phonetic-only instead of raising.

    Returns:
        True if the text contains a square bracket and fewer than two
        ``_RE_ALPHA`` characters remain once every ``[...]`` group, stray
        brackets, quotes, parentheses and whitespace have been removed.
    """
    if not text:
        return False
    t = text.strip()
    if not t:
        return False
    # Without any square bracket the text cannot be a transcription.
    if '[' not in t and ']' not in t:
        return False
    # Drop complete [...] groups first, then leftover delimiters/whitespace,
    # so only characters outside the transcription are left to count.
    without_brackets = re.sub(r"\[.*?\]", '', t)
    without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
    alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
    # A single stray letter is tolerated; two or more count as a real word.
    return len(alpha_remaining) < 2


def _merge_phonetic_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Fold phonetic-only rows into the entry that precedes them.

    On dictionary pages the phonetic transcription sometimes wraps onto the
    next row, e.g.:
        Row 28: EN="it's a money-saver"  DE="es spart Kosten"
        Row 29: EN="['mani serva]"       DE=""

    Row 29 is phonetic-only, so its EN text (and its example text, if any)
    is appended to row 28 and row 29 itself is dropped.

    Args:
        entries: Vocabulary entries; the 'english', 'german', 'example' and
            'row_index' keys are read.

    Returns:
        A new list. Entries that absorb a continuation row are shallow
        copies, so neither the input list nor its dicts are modified; all
        other entries are passed through as the same objects.
    """
    merged: List[Dict[str, Any]] = []
    for entry in entries:
        en = (entry.get('english') or '').strip()
        de = (entry.get('german') or '').strip()

        # A continuation row needs a predecessor, an empty DE cell and an EN
        # cell that is pure transcription; everything else passes through.
        if not (merged and not de and _is_phonetic_only_text(en)):
            merged.append(entry)
            continue

        # Copy-on-merge: never write into a dict owned by the caller.
        prev = dict(merged[-1])
        prev_en = (prev.get('english') or '').strip()
        prev['english'] = f"{prev_en} {en}" if prev_en else en

        # Carry over example text from the continuation row as well.
        ex = (entry.get('example') or '').strip()
        if ex:
            prev_ex = (prev.get('example') or '').strip()
            prev['example'] = f"{prev_ex} {ex}" if prev_ex else ex

        merged[-1] = prev
        logger.debug(
            "Merged phonetic row %s into previous entry: %r",
            entry.get('row_index'),
            prev['english'],
        )

    return merged