Fix garbled IPA detection for bracket-notation like [n, nn] and [1uedtX,1]

- Detect bracketed text without real IPA symbols as garbled OCR phonetics
- Allow the IPA continuation fix even when other columns have content
  (for rows where the EN cell is clearly garbled bracketed IPA)
- Strip parenthetical grammar annotations like (no pl) from the headword
  before the IPA lookup in fix_ipa_continuation_cell

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 23:28:00 +01:00
parent 7750b2a05f
commit 6bfa9eed86
3 changed files with 77 additions and 12 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1616,9 +1616,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:

        # 5d. Fix IPA continuation rows — rows where the printed
        # phonetic transcription wraps to a line below the headword.
-        # These contain only garbled IPA in the EN column and nothing
-        # in other columns.  Replace garbled text with proper IPA
-        # looked up from the headword in the previous row.
+        # These contain garbled IPA in the EN column.  Replace garbled
+        # text with proper IPA looked up from the headword in the
+        # previous row.
        ipa_cont_fixed = 0
        for z in zones_data:
            rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
@@ -1630,13 +1630,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                    c for c in row_cells
                    if c.get("col_type") == en_col_type
                ]
-                # Other cells with ≥3 chars (ignore margin noise)
-                other_cells = [
-                    c for c in row_cells
-                    if c.get("col_type") != en_col_type
-                    and len((c.get("text") or "").strip()) >= 3
-                ]
-                if not en_cells or other_cells:
+                if not en_cells:
                    continue
                en_text = en_cells[0].get("text", "")
                if not _text_has_garbled_ipa(en_text):
@@ -1644,6 +1638,21 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                # Already has proper IPA brackets → already fixed
                if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', en_text):
                    continue
+                # Unless the EN cell is obviously garbled (i.e. bracketed
+                # text without IPA symbols), require that other columns
+                # are empty — otherwise it's a normal content row.
+                en_stripped = en_text.strip()
+                is_bracket_garbled = (
+                    en_stripped.startswith('[') and en_stripped.endswith(']')
+                )
+                if not is_bracket_garbled:
+                    other_cells = [
+                        c for c in row_cells
+                        if c.get("col_type") != en_col_type
+                        and len((c.get("text") or "").strip()) >= 3
+                    ]
+                    if other_cells:
+                        continue
                # Find headword in previous row
                if idx == 0:
                    continue