From 8ef4c089cfc2b8e1b5e52bfcfb63ad3efb4c0ab3 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Wed, 18 Mar 2026 12:05:38 +0100
Subject: [PATCH] Remove IPA continuation rows and support hyphenated word
 lookup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- grid_editor_api: After IPA correction, detect rows that contain only
  garbled phonetics in the English column (no German translation —
  other columns hold at most short margin noise of fewer than 3
  characters — and no IPA brackets inserted). These are wrap-around
  lines where the printed IPA extends to the line below the headword.
  Remove them, since the headword row already has the correct IPA.
- cv_ocr_engines: _insert_missing_ipa now tries the dehyphenated form
  as a fallback (e.g. "second-hand" → "secondhand") for the dictionary
  lookup, fixing IPA insertion for hyphenated compound words.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_ocr_engines.py  |  3 ++
 klausur-service/backend/grid_editor_api.py | 44 ++++++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py
index 51d4d54..67648c3 100644
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -1026,6 +1026,9 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
         if clean.lower() in _GRAMMAR_BRACKET_WORDS:
             continue
         ipa = _lookup_ipa(clean, pronunciation)
+        # Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
+        if not ipa and '-' in clean:
+            ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
         if ipa:
             words[i] = f"{w} [{ipa}]"
             # Strip garbled OCR phonetics after the IPA bracket.
diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 8d8a937..94282e4 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1196,6 +1196,50 @@ async def build_grid(session_id: str):
             if orig:
                 cell["col_type"] = orig
 
+        # 5d. Remove IPA continuation rows — rows where the printed
+        # phonetic transcription wraps to a line below the headword.
+        # These rows have text only in the English column (+ margin
+        # noise) and fix_cell_phonetics did NOT insert IPA brackets
+        # (because there's no real English word to look up).
+        ipa_cont_rows: set = set()
+        for z in zones_data:
+            for row in z.get("rows", []):
+                ri = row["index"]
+                row_cells = [
+                    c for c in z.get("cells", [])
+                    if c.get("row_index") == ri
+                ]
+                en_cells = [
+                    c for c in row_cells
+                    if c.get("col_type") == en_col_type
+                ]
+                # Other cells with ≥3 chars (ignore margin noise)
+                other_cells = [
+                    c for c in row_cells
+                    if c.get("col_type") != en_col_type
+                    and len((c.get("text") or "").strip()) >= 3
+                ]
+                if en_cells and not other_cells:
+                    en_text = en_cells[0].get("text", "")
+                    # No IPA brackets → phonetics not recognized →
+                    # this is a garbled IPA continuation row
+                    if "[" not in en_text:
+                        ipa_cont_rows.add(ri)
+        if ipa_cont_rows:
+            for z in zones_data:
+                z["rows"] = [
+                    r for r in z.get("rows", [])
+                    if r["index"] not in ipa_cont_rows
+                ]
+                z["cells"] = [
+                    c for c in z.get("cells", [])
+                    if c.get("row_index") not in ipa_cont_rows
+                ]
+            logger.info(
+                "removed %d IPA continuation rows: %s",
+                len(ipa_cont_rows), sorted(ipa_cont_rows),
+            )
+
     duration = time.time() - t0
 
     # 6. Build result