From c3a924a62079388e1614ccfa6650672cad79f3b1 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Mon, 2 Mar 2026 14:14:20 +0100
Subject: [PATCH] fix(ocr-pipeline): merge phonetic-only rows and fix bracket
 noise filter

Two fixes:
1. Tokens ending with ] (e.g. "serva]") were stripped by the noise
   filter because ] was not in the allowed punctuation list.
2. Rows containing only phonetic transcription (e.g. ['mani serva])
   are now merged into the previous vocab entry instead of creating
   a separate (invalid) entry. This prevents the LLM from trying
   to "correct" phonetic fragments.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 82 +++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index d906c0a..700bc9f 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3281,9 +3281,11 @@ def _is_noise_tail_token(token: str) -> bool:
     if t in ('...', '…'):
         return False
 
-    # Keep phonetic brackets: [eg], [maus], ["a:mand], etc.
+    # Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc.
     if t.startswith('[') or t.startswith('["') or t.startswith("['"):
         return False
+    if t.endswith(']'):
+        return False
 
     # Pure non-alpha → noise ("3", ")", "|")
     alpha_chars = _RE_ALPHA.findall(t)
@@ -3792,6 +3794,81 @@ def _cells_to_vocab_entries(
     return entries
 
 
+# Regex meant for phonetic-only lines. NOTE(review): unused in this patch; also matches bracket-free text like "Mandel".
+_PHONETIC_ONLY_RE = re.compile(
+    r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
+)
+
+
+def _is_phonetic_only_text(text: str) -> bool:
+    """Check if text consists only of phonetic transcription.
+
+    Needs a '[' or ']'; under 2 _RE_ALPHA chars may remain outside [...] pairs:
+      ['mani serva]   →  True
+      [dɑːns]         →  True
+      ["a:mand]       →  True
+      almond ['a:mand] → False (has real word before bracket)
+      Mandel           → False
+    """
+    t = text.strip()
+    if not t:
+        return False
+    # Require at least one '[' or ']' (a single unpaired bracket is enough here)
+    if '[' not in t and ']' not in t:
+        return False
+    # Drop paired [...] spans (non-greedy), then stray brackets/quotes/parens/space
+    without_brackets = re.sub(r"\[.*?\]", '', t)
+    without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
+    # <2 leftover alpha chars => phonetic-only; NOTE(review): "a [eɪ]" likely passes too
+    alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
+    return len(alpha_remaining) < 2
+
+
+def _merge_phonetic_continuation_rows(
+    entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Merge rows that contain only phonetic transcription into previous entry.
+
+    In dictionary pages, phonetic transcription sometimes wraps to the next
+    row.  E.g.:
+      Row 28: EN="it's a money-saver"  DE="es spart Kosten"
+      Row 29: EN="['mani serva]"       DE=""
+    Row 29 is phonetic-only → joined onto row 28's EN (example too) and dropped.
+    Input dicts are mutated in place; inputs with < 2 entries are returned as-is.
+    """
+    if len(entries) < 2:
+        return entries
+
+    merged: List[Dict[str, Any]] = []
+    for entry in entries:
+        en = (entry.get('english') or '').strip()
+        de = (entry.get('german') or '').strip()
+        ex = (entry.get('example') or '').strip()
+
+        # Phonetic-only EN with empty DE; needs a kept predecessor (first row never merges)
+        if merged and _is_phonetic_only_text(en) and not de:
+            prev = merged[-1]
+            prev_en = (prev.get('english') or '').strip()
+            # Append phonetic to previous entry's EN (mutates the kept dict in place)
+            if prev_en:
+                prev['english'] = prev_en + ' ' + en
+            else:
+                prev['english'] = en
+            # Carry over any example text, space-joined onto the previous example
+            if ex:
+                prev_ex = (prev.get('example') or '').strip()
+                prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
+            logger.debug(
+                f"Merged phonetic row {entry.get('row_index')} "
+                f"into previous entry: {prev['english']!r}"
+            )
+            continue
+
+        merged.append(entry)
+
+    return merged
+
+
 def build_word_grid(
     ocr_img: np.ndarray,
     column_regions: List[PageRegion],
@@ -3843,6 +3920,9 @@ def build_word_grid(
     # --- Post-processing pipeline (deterministic, no LLM) ---
     n_raw = len(entries)
 
+    # 0. Merge phonetic-only continuation rows into previous entry
+    entries = _merge_phonetic_continuation_rows(entries)
+
     # 1. Fix character confusion (I/1/l based on context)
     entries = _fix_character_confusion(entries)