diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 2111893..c662be1 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -993,6 +993,18 @@ def _text_has_garbled_ipa(text: str) -> bool: it must only insert IPA to *replace* garbled phonetics that are already in the text — never to ADD phonetics where none existed on the page. """ + # Bracketed text that doesn't contain valid IPA symbols is garbled OCR + # of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]". + stripped = text.strip() + if stripped.startswith('[') and stripped.endswith(']'): + inner = stripped[1:-1] + # Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ) + if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'): + # Not a valid dictionary-style bracket like "(no pl)" — those + # use parentheses, not square brackets. Square brackets with + # no IPA chars are garbled phonetics. + return True + for w in text.strip().split(): # Skip delimiters and very short tokens if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'): @@ -1238,8 +1250,10 @@ def fix_ipa_continuation_cell( if not IPA_AVAILABLE or not garbled_text or not headword_text: return garbled_text - # Strip existing IPA brackets from headword text - clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text).strip() + # Strip existing IPA brackets and parenthetical grammar annotations + # like "(no pl)", "(sth)", "(sb)" from headword text + clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text) + clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip() if not clean_hw: return garbled_text diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 2a2d479..15e43ec 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1616,9 +1616,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # 5d. 
Fix IPA continuation rows — rows where the printed # phonetic transcription wraps to a line below the headword. - # These contain only garbled IPA in the EN column and nothing - # in other columns. Replace garbled text with proper IPA - # looked up from the headword in the previous row. + # These contain garbled IPA in the EN column. Replace garbled + # text with proper IPA looked up from the headword in the + # previous row. ipa_cont_fixed = 0 for z in zones_data: rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"]) @@ -1630,13 +1630,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: c for c in row_cells if c.get("col_type") == en_col_type ] - # Other cells with ≥3 chars (ignore margin noise) - other_cells = [ - c for c in row_cells - if c.get("col_type") != en_col_type - and len((c.get("text") or "").strip()) >= 3 - ] - if not en_cells or other_cells: + if not en_cells: continue en_text = en_cells[0].get("text", "") if not _text_has_garbled_ipa(en_text): @@ -1644,6 +1638,21 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # Already has proper IPA brackets → already fixed if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', en_text): continue + # Unless the EN cell is obviously garbled (i.e. fully + # bracketed non-IPA text), require that other columns + # are empty — otherwise it's a normal content row. 
+ en_stripped = en_text.strip() + is_bracket_garbled = ( + en_stripped.startswith('[') and en_stripped.endswith(']') + ) + if not is_bracket_garbled: + other_cells = [ + c for c in row_cells + if c.get("col_type") != en_col_type + and len((c.get("text") or "").strip()) >= 3 + ] + if other_cells: + continue # Find headword in previous row if idx == 0: continue diff --git a/klausur-service/backend/tests/test_grid_editor_api.py b/klausur-service/backend/tests/test_grid_editor_api.py index c9c1284..24f8adb 100644 --- a/klausur-service/backend/tests/test_grid_editor_api.py +++ b/klausur-service/backend/tests/test_grid_editor_api.py @@ -19,6 +19,7 @@ from grid_editor_api import ( _detect_header_rows, _detect_heading_rows_by_color, ) +from cv_ocr_engines import _text_has_garbled_ipa, fix_ipa_continuation_cell # --------------------------------------------------------------------------- @@ -452,3 +453,44 @@ class TestDetectHeaderRowsSkipFlag: ] headers = _detect_header_rows(rows, words, 0, skip_first_row_header=True) assert 0 not in headers + + +# --------------------------------------------------------------------------- +# _text_has_garbled_ipa + fix_ipa_continuation_cell +# --------------------------------------------------------------------------- + +class TestGarbledIpaDetection: + """Test detection and fixing of garbled IPA in bracket notation.""" + + def test_bracket_garbled_no_ipa_chars(self): + """'[n, nn]' — brackets with no real IPA chars → garbled.""" + assert _text_has_garbled_ipa("[n, nn]") is True + + def test_bracket_garbled_alphanumeric(self): + """'[1uedtX,1]' — brackets with digits/letters → garbled.""" + assert _text_has_garbled_ipa("[1uedtX,1]") is True + + def test_bracket_valid_ipa_not_garbled(self): + """'[ɪkwˈɪpmənt]' — brackets with real IPA → not garbled.""" + assert _text_has_garbled_ipa("[ɪkwˈɪpmənt]") is False + + def test_no_brackets_normal_word(self): + """'equipment' — normal word → not garbled.""" + assert 
_text_has_garbled_ipa("equipment") is False + + def test_fix_continuation_united_kingdom(self): + """IPA continuation for 'the United Kingdom' → proper IPA.""" + fixed = fix_ipa_continuation_cell( + "[n, nn]", "the United Kingdom", pronunciation="british", + ) + # Should contain proper IPA, not the garbled text + assert fixed != "[n, nn]" + assert "kˈɪŋdəm" in fixed # Kingdom IPA + + def test_fix_continuation_equipment(self): + """IPA continuation for 'equipment' → proper IPA.""" + fixed = fix_ipa_continuation_cell( + "[1uedtX,1]", "equipment (no pl)", pronunciation="british", + ) + assert fixed != "[1uedtX,1]" + assert "ɪkwˈɪpmənt" in fixed # equipment IPA