Improve IPA continuation row detection with phonetic heuristics

Strip IPA brackets that fix_cell_phonetics may have added for short
dictionary words (e.g. "si" → "[si]") before checking whether the row
is a garbled phonetic continuation. Treat the remaining text as
phonetic if it contains ':' (length mark), starts with an apostrophe
(stress mark), or has no word with ≥3 letters.

Fixes Row 39 ("si: [si] — So: - si:n") not being removed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-18 12:08:21 +01:00
parent 8ef4c089cf
commit c894a0feeb

View File

@@ -12,6 +12,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
import time
from typing import Any, Dict, List, Optional
@@ -1221,9 +1222,24 @@ async def build_grid(session_id: str):
]
if en_cells and not other_cells:
en_text = en_cells[0].get("text", "")
# No IPA brackets → phonetics not recognized →
# this is a garbled IPA continuation row
if "[" not in en_text:
# Strip any IPA brackets that fix_cell_phonetics
# may have added for short dictionary matches
# (e.g. "si" → "[si]") to check underlying text.
text_bare = re.sub(r'\[[^\]]*\]', '', en_text).strip()
# Garbled IPA typically contains ':' (length mark)
# or starts with ' (stress mark), and has no word
# with ≥3 letters that could be a real headword.
has_headword = any(
len(re.sub(r'[^a-zA-Z]', '', w)) >= 3
for w in text_bare.split()
) if text_bare else False
looks_phonetic = (
':' in text_bare
or text_bare.startswith("'")
or text_bare.startswith("\u2019")
or not has_headword
)
if looks_phonetic:
ipa_cont_rows.add(ri)
if ipa_cont_rows:
for z in zones_data: