From c894a0feebec9cc57a6afe0bc3d0de40ff303268 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 18 Mar 2026 12:08:21 +0100 Subject: [PATCH] Improve IPA continuation row detection with phonetic heuristics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Strip IPA brackets that fix_cell_phonetics may have added for short dictionary words (e.g. "si" → "[si]") before checking if the row is a garbled phonetic continuation. Detect phonetic text by presence of ':' (length marks), leading apostrophe (stress marks), or absence of any word with ≥3 letters. Fixes Row 39 ("si: [si] — So: - si:n") not being removed. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 94282e4..36ef379 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -12,6 +12,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging +import re import time from typing import Any, Dict, List, Optional @@ -1221,9 +1222,24 @@ async def build_grid(session_id: str): ] if en_cells and not other_cells: en_text = en_cells[0].get("text", "") - # No IPA brackets → phonetics not recognized → - # this is a garbled IPA continuation row - if "[" not in en_text: + # Strip any IPA brackets that fix_cell_phonetics + # may have added for short dictionary matches + # (e.g. "si" → "[si]") to check underlying text. + text_bare = re.sub(r'\[[^\]]*\]', '', en_text).strip() + # Garbled IPA typically contains ':' (length mark) + # or starts with ' (stress mark), and has no word + # with ≥3 letters that could be a real headword. + has_headword = any( + len(re.sub(r'[^a-zA-Z]', '', w)) >= 3 + for w in text_bare.split() + ) if text_bare else False + looks_phonetic = ( + ':' in text_bare + or text_bare.startswith("'") + or text_bare.startswith("\u2019") + or not has_headword + ) + if looks_phonetic: ipa_cont_rows.add(ri) if ipa_cont_rows: for z in zones_data: