fix(ocr-pipeline): merge phonetic-only rows and fix bracket noise filter

Two fixes: 1. Tokens ending with ] (e.g. "serva]") were stripped by the noise filter because ] was not in the allowed punctuation list. 2. Rows containing only phonetic transcription (e.g. ['mani serva]) are now merged into the previous vocab entry instead of creating a separate (invalid) entry. This prevents the LLM from trying to "correct" phonetic fragments. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 14:14:20 +01:00
parent 650f15bc1b
commit c3a924a620
1 changed files with 81 additions and 1 deletions
@@ -3281,9 +3281,11 @@ def _is_noise_tail_token(token: str) -> bool:
    if t in ('...', '…'):
        return False
-    # Keep phonetic brackets: [eg], [maus], ["a:mand], etc.
+    # Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc.
    if t.startswith('[') or t.startswith('["') or t.startswith("['"):
        return False
    if t.endswith(']'):
        return False
    # Pure non-alpha → noise ("3", ")", "|")
    alpha_chars = _RE_ALPHA.findall(t)
@@ -3792,6 +3794,81 @@ def _cells_to_vocab_entries(
    return entries
 # Regex intended for a line made up of phonetic bracket content only.
 # Structure: optional leading whitespace, any run of opening characters
 # ([ ( ' "), any run of characters other than "]", then any run of
 # "]", ")" or whitespace up to the end of the string.
 # NOTE(review): because the middle class [^\]]* accepts arbitrary text, the
 # pattern also matches plain words ("Mandel") and word+phonetic strings
 # ("almond ['a:mand]"), so on its own it does not prove "no real word
 # before the bracket".  It is not referenced by the helpers visible in
 # this hunk — confirm it is still needed before relying on it.
 _PHONETIC_ONLY_RE = re.compile(
    r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
 )
 def _is_phonetic_only_text(text: str) -> bool:
    """Return True when *text* is nothing but a phonetic transcription.

    A string qualifies when it contains at least one square bracket and,
    after every ``[...]`` group plus any stray bracket, quote, parenthesis
    and whitespace character has been removed, the combined length of the
    remaining ``_RE_ALPHA`` matches is below two.

    Examples:
      ['mani serva]    -> True
      [dɑːns]          -> True
      ["a:mand]        -> True
      almond ['a:mand] -> False (a real word precedes the bracket)
      Mandel           -> False (no bracket at all)
    """
    stripped = text.strip()
    has_bracket = '[' in stripped or ']' in stripped
    if not stripped or not has_bracket:
        return False
    # Drop complete [...] groups first, then whatever bracket, quote,
    # parenthesis or whitespace characters are still left over.
    residue = re.sub(
        r"[\[\]'\"()\s]",
        '',
        re.sub(r"\[.*?\]", '', stripped),
    )
    # A single stray matched character outside the brackets is tolerated.
    leftover = ''.join(_RE_ALPHA.findall(residue))
    return len(leftover) <= 1
 def _merge_phonetic_continuation_rows(
    entries: List[Dict[str, Any]],
 ) -> List[Dict[str, Any]]:
    """Fold phonetic-only rows into the entry that precedes them.

    In dictionary pages the phonetic transcription sometimes wraps onto the
    next row, e.g.:

      Row 28: EN="it's a money-saver"  DE="es spart Kosten"
      Row 29: EN="['mani serva]"       DE=""

    Row 29 is phonetic-only, so its EN text is appended to row 28's EN
    field and row 29 itself is dropped from the result.

    Args:
        entries: Vocab entry dicts; the 'english', 'german', 'example' and
            'row_index' keys are read, all of them optional.

    Returns:
        ``entries`` itself when it holds fewer than two items; otherwise a
        new list without the merged rows.  The surviving dicts are the same
        objects as in ``entries`` — a previous entry that absorbs a phonetic
        row is updated in place ('english' and, if present, 'example').
    """
    if len(entries) < 2:
        return entries
    merged: List[Dict[str, Any]] = []
    for entry in entries:
        en = (entry.get('english') or '').strip()
        de = (entry.get('german') or '').strip()
        # A continuation row needs a predecessor, an empty DE cell and an EN
        # cell that is phonetic-only.  The cheap checks run first so the
        # regex-based test is skipped for ordinary rows with a translation.
        is_continuation = (
            bool(merged) and not de and _is_phonetic_only_text(en)
        )
        if not is_continuation:
            merged.append(entry)
            continue

        prev = merged[-1]
        prev_en = (prev.get('english') or '').strip()
        # Append the phonetic text to the previous entry's EN field.
        prev['english'] = f"{prev_en} {en}" if prev_en else en

        # Carry over an example that sits on the continuation row as well.
        ex = (entry.get('example') or '').strip()
        if ex:
            prev_ex = (prev.get('example') or '').strip()
            prev['example'] = f"{prev_ex} {ex}" if prev_ex else ex

        # Lazy %-style arguments: the message is only formatted when DEBUG
        # logging is actually enabled.
        logger.debug(
            "Merged phonetic row %s into previous entry: %r",
            entry.get('row_index'),
            prev['english'],
        )
    return merged
 def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
@@ -3843,6 +3920,9 @@ def build_word_grid(
    # --- Post-processing pipeline (deterministic, no LLM) ---
    n_raw = len(entries)
    # 0. Merge phonetic-only continuation rows into previous entry
    entries = _merge_phonetic_continuation_rows(entries)
    # 1. Fix character confusion (I/1/l based on context)
    entries = _fix_character_confusion(entries)