Fix garbled IPA detection for bracket-notation like [n, nn] and [1uedtX,1]
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-agent-core (push) Has been cancelled
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-klausur (push) Has been cancelled
- Detect bracketed text without real IPA symbols as garbled OCR phonetics
- Allow IPA continuation fix even when other columns have content (for rows where EN cell is clearly garbled bracketed IPA)
- Strip parenthetical grammar annotations like (no pl) from headword before IPA lookup in fix_ipa_continuation_cell

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -993,6 +993,18 @@ def _text_has_garbled_ipa(text: str) -> bool:
|
||||
it must only insert IPA to *replace* garbled phonetics that are already
|
||||
in the text — never to ADD phonetics where none existed on the page.
|
||||
"""
|
||||
# Bracketed text that doesn't contain valid IPA symbols is garbled OCR
|
||||
# of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
|
||||
stripped = text.strip()
|
||||
if stripped.startswith('[') and stripped.endswith(']'):
|
||||
inner = stripped[1:-1]
|
||||
# Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
|
||||
if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
|
||||
# Not a valid dictionary-style bracket like "(no pl)" — those
|
||||
# use parentheses, not square brackets. Square brackets with
|
||||
# no IPA chars are garbled phonetics.
|
||||
return True
|
||||
|
||||
for w in text.strip().split():
|
||||
# Skip delimiters and very short tokens
|
||||
if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'):
|
||||
@@ -1238,8 +1250,10 @@ def fix_ipa_continuation_cell(
|
||||
if not IPA_AVAILABLE or not garbled_text or not headword_text:
|
||||
return garbled_text
|
||||
|
||||
# Strip existing IPA brackets from headword text
|
||||
clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text).strip()
|
||||
# Strip existing IPA brackets and parenthetical grammar annotations
|
||||
# like "(no pl)", "(sth)", "(sb)" from headword text
|
||||
clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
|
||||
clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
|
||||
if not clean_hw:
|
||||
return garbled_text
|
||||
|
||||
|
||||
Reference in New Issue
Block a user