Improve IPA continuation row detection with phonetic heuristics
Strip IPA brackets that fix_cell_phonetics may have added for short
dictionary words (e.g. "si" → "[si]") before checking whether the row is
a garbled phonetic continuation. Treat the remaining text as phonetic if
it contains ':' (length mark), starts with an apostrophe (stress mark),
or contains no word with ≥3 letters.
Fixes row 39 ("si: [si] — So: - si:n") not being removed as an IPA continuation row.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
@@ -1221,9 +1222,24 @@ async def build_grid(session_id: str):
|
|||||||
]
|
]
|
||||||
if en_cells and not other_cells:
|
if en_cells and not other_cells:
|
||||||
en_text = en_cells[0].get("text", "")
|
en_text = en_cells[0].get("text", "")
|
||||||
# No IPA brackets → phonetics not recognized →
|
# Strip any IPA brackets that fix_cell_phonetics
|
||||||
# this is a garbled IPA continuation row
|
# may have added for short dictionary matches
|
||||||
if "[" not in en_text:
|
# (e.g. "si" → "[si]") to check underlying text.
|
||||||
|
text_bare = re.sub(r'\[[^\]]*\]', '', en_text).strip()
|
||||||
|
# Garbled IPA typically contains ':' (length mark)
|
||||||
|
# or starts with ' (stress mark), or has no word
|
||||||
|
# with ≥3 letters that could be a real headword.
|
||||||
|
has_headword = any(
|
||||||
|
len(re.sub(r'[^a-zA-Z]', '', w)) >= 3
|
||||||
|
for w in text_bare.split()
|
||||||
|
) if text_bare else False
|
||||||
|
looks_phonetic = (
|
||||||
|
':' in text_bare
|
||||||
|
or text_bare.startswith("'")
|
||||||
|
or text_bare.startswith("\u2019")
|
||||||
|
or not has_headword
|
||||||
|
)
|
||||||
|
if looks_phonetic:
|
||||||
ipa_cont_rows.add(ri)
|
ipa_cont_rows.add(ri)
|
||||||
if ipa_cont_rows:
|
if ipa_cont_rows:
|
||||||
for z in zones_data:
|
for z in zones_data:
|
||||||
|
|||||||
Reference in New Issue
Block a user