From c894a0feebec9cc57a6afe0bc3d0de40ff303268 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Wed, 18 Mar 2026 12:08:21 +0100
Subject: [PATCH] Improve IPA continuation row detection with phonetic
 heuristics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Strip bracketed IPA segments (the brackets and their contents) that
fix_cell_phonetics may have added for short dictionary words
(e.g. "si" → "[si]") before checking whether the row is a garbled
phonetic continuation. The remaining text is treated as phonetic if it
contains ':' (length mark), starts with an apostrophe (stress mark), or
has no word with ≥3 letters.

Fixes Row 39 ("si: [si] — So: - si:n") not being removed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 94282e4..36ef379 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -12,6 +12,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 
 import logging
+import re
 import time
 from typing import Any, Dict, List, Optional
 
@@ -1221,9 +1222,24 @@ async def build_grid(session_id: str):
                 ]
                 if en_cells and not other_cells:
                     en_text = en_cells[0].get("text", "")
-                    # No IPA brackets → phonetics not recognized →
-                    # this is a garbled IPA continuation row
-                    if "[" not in en_text:
+                    # Drop bracketed IPA segments (brackets and contents)
+                    # that fix_cell_phonetics may have added for short
+                    # dictionary matches ("si" → "[si]"); check the rest.
+                    text_bare = re.sub(r'\[[^\]]*\]', '', en_text).strip()
+                    # Treat the row as garbled IPA if it contains ':'
+                    # (length mark), starts with an apostrophe (stress
+                    # mark), or has no word with ≥3 letters (no headword).
+                    has_headword = any(
+                        len(re.sub(r'[^a-zA-Z]', '', w)) >= 3
+                        for w in text_bare.split()
+                    ) if text_bare else False
+                    looks_phonetic = (
+                        ':' in text_bare
+                        or text_bare.startswith("'")
+                        or text_bare.startswith("\u2019")
+                        or not has_headword
+                    )
+                    if looks_phonetic:
                         ipa_cont_rows.add(ri)
         if ipa_cont_rows:
             for z in zones_data: