Improve IPA continuation row detection with phonetic heuristics
Strip IPA brackets that fix_cell_phonetics may have added for short
dictionary words (e.g. "si" → "[si]") before checking if the row is
a garbled phonetic continuation. Treat the remaining text as phonetic if it
contains ':' (a length mark), starts with an apostrophe (a stress mark), or
contains no word with ≥3 letters that could be a real headword.
Fixes Row 39 ("si: [si] — So: - si:n") not being removed.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
@@ -1221,9 +1222,24 @@ async def build_grid(session_id: str):
|
||||
]
|
||||
if en_cells and not other_cells:
|
||||
en_text = en_cells[0].get("text", "")
|
||||
# No IPA brackets → phonetics not recognized →
|
||||
# this is a garbled IPA continuation row
|
||||
if "[" not in en_text:
|
||||
# Strip any IPA brackets that fix_cell_phonetics
|
||||
# may have added for short dictionary matches
|
||||
# (e.g. "si" → "[si]") to check underlying text.
|
||||
text_bare = re.sub(r'\[[^\]]*\]', '', en_text).strip()
|
||||
# Garbled IPA typically contains ':' (length mark)
|
||||
# or starts with ' (stress mark), and has no word
|
||||
# with ≥3 letters that could be a real headword.
|
||||
has_headword = any(
|
||||
len(re.sub(r'[^a-zA-Z]', '', w)) >= 3
|
||||
for w in text_bare.split()
|
||||
) if text_bare else False
|
||||
looks_phonetic = (
|
||||
':' in text_bare
|
||||
or text_bare.startswith("'")
|
||||
or text_bare.startswith("\u2019")
|
||||
or not has_headword
|
||||
)
|
||||
if looks_phonetic:
|
||||
ipa_cont_rows.add(ri)
|
||||
if ipa_cont_rows:
|
||||
for z in zones_data:
|
||||
|
||||
Reference in New Issue
Block a user