fix(ocr-pipeline): merge phonetic-only rows and fix bracket noise filter

Two fixes:
1. Tokens ending with ] (e.g. "serva]") were stripped by the noise filter because ] was not in the allowed punctuation list.
2. Rows containing only phonetic transcription (e.g. ['mani serva]) are now merged into the previous vocab entry instead of creating a separate (invalid) entry. This prevents the LLM from trying to "correct" phonetic fragments.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 14:14:20 +01:00
parent 650f15bc1b
commit c3a924a620
1 changed files with 81 additions and 1 deletions
@@ -3281,9 +3281,11 @@ def _is_noise_tail_token(token: str) -> bool:
    if t in ('...', '…'):
        return False

-    # Keep phonetic brackets: [eg], [maus], ["a:mand], etc.
+    # Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc.
    if t.startswith('[') or t.startswith('["') or t.startswith("['"):
        return False
+    if t.endswith(']'):
+        return False

    # Pure non-alpha → noise ("3", ")", "|")
    alpha_chars = _RE_ALPHA.findall(t)
@@ -3792,6 +3794,81 @@ def _cells_to_vocab_entries(
    return entries


+# Regex: the whole line is one phonetic bracket group with no real word
+# before it, e.g. ['mani serva] or [dɑːns].  The opening '[' is mandatory
+# and may be followed by a single stress quote; the closing ']' is optional
+# so that an unclosed group still matches.  The previous pattern made every
+# part optional and therefore also matched plain words such as "Mandel".
+_PHONETIC_ONLY_RE = re.compile(
+    r'''^\s*\[['"]?[^\[\]]+\]?\s*$'''
+)
+
+
+def _is_phonetic_only_text(text: str) -> bool:
+    """Return True when *text* is nothing but a phonetic transcription.
+
+    The text must contain at least one square bracket.  Complete ``[...]``
+    groups are removed, then stray brackets, quotes, parentheses and
+    whitespace; the text counts as phonetic-only when fewer than two
+    characters matched by ``_RE_ALPHA`` are left over.
+
+    Examples (assuming ``_RE_ALPHA`` matches letters):
+      ['mani serva]    →  True
+      [dɑːns]          →  True
+      ["a:mand]        →  True
+      almond ['a:mand] →  False (a real word precedes the bracket)
+      Mandel           →  False (no bracket at all)
+    """
+    stripped = text.strip()
+    # Empty text, or text without any square bracket, is never phonetic.
+    if not stripped or not ('[' in stripped or ']' in stripped):
+        return False
+
+    # First drop complete bracket groups, then whatever bracket, quote,
+    # parenthesis or whitespace characters are still lying around.
+    residue = re.sub(r"\[.*?\]", '', stripped)
+    residue = re.sub(r"[\[\]'\"()\s]", '', residue)
+
+    # Anything alphabetic that survives is text outside the brackets; a
+    # single leftover character is tolerated.
+    leftover_letters = ''.join(_RE_ALPHA.findall(residue))
+    return len(leftover_letters) < 2
+
+
+def _merge_phonetic_continuation_rows(
+    entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Merge rows that contain only phonetic transcription into previous entry.
+
+    In dictionary pages, phonetic transcription sometimes wraps to the next
+    row.  E.g.:
+      Row 28: EN="it's a money-saver"  DE="es spart Kosten"
+      Row 29: EN="['mani serva]"       DE=""
+
+    Row 29 is phonetic-only → merge into row 28's EN field.
+
+    Args:
+        entries: Vocab entries in row order.  The keys read here are
+            'english', 'german', 'example' and 'row_index'.
+
+    Returns:
+        A new list without the merged continuation rows.  When fewer than
+        two entries are given, the input list itself is returned unchanged.
+
+    Note:
+        The entry that receives a merged row is updated in place, so the
+        corresponding dict in the caller's list changes as well.
+    """
+    if len(entries) < 2:
+        return entries
+
+    merged: List[Dict[str, Any]] = []
+    for entry in entries:
+        en = (entry.get('english') or '').strip()
+        de = (entry.get('german') or '').strip()
+
+        # A continuation row needs a predecessor to merge into, an empty DE
+        # field and an EN field holding nothing but phonetics.  Everything
+        # else is kept as a regular entry.
+        if not (merged and not de and _is_phonetic_only_text(en)):
+            merged.append(entry)
+            continue
+
+        prev = merged[-1]
+
+        # Append phonetic to previous entry's EN
+        prev_en = (prev.get('english') or '').strip()
+        prev['english'] = f"{prev_en} {en}" if prev_en else en
+
+        # If there was an example, append to previous too
+        ex = (entry.get('example') or '').strip()
+        if ex:
+            prev_ex = (prev.get('example') or '').strip()
+            prev['example'] = f"{prev_ex} {ex}" if prev_ex else ex
+
+        # Lazy %-style arguments: the message is only formatted when the
+        # DEBUG level is actually enabled.
+        logger.debug(
+            "Merged phonetic row %s into previous entry: %r",
+            entry.get('row_index'),
+            prev['english'],
+        )
+
+    return merged
+
+
 def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
@@ -3843,6 +3920,9 @@ def build_word_grid(
    # --- Post-processing pipeline (deterministic, no LLM) ---
    n_raw = len(entries)

+    # 0. Merge phonetic-only continuation rows into previous entry
+    entries = _merge_phonetic_continuation_rows(entries)
+
    # 1. Fix character confusion (I/1/l based on context)
    entries = _fix_character_confusion(entries)