Fix IPA stripping digits after headwords (Theme 1 → Theme)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 46s
CI / test-go-edu-search (push) Successful in 42s
CI / test-python-klausur (push) Failing after 2m46s
CI / test-python-agent-core (push) Successful in 34s
CI / test-nodejs-website (push) Successful in 30s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 46s
CI / test-go-edu-search (push) Successful in 42s
CI / test-python-klausur (push) Failing after 2m46s
CI / test-python-agent-core (push) Successful in 34s
CI / test-nodejs-website (push) Successful in 30s
_insert_missing_ipa stripped "1" from "Theme 1" because it treated the digit as garbled OCR phonetics. It now treats pure digits and numbering patterns (e.g. "1", "2.", "3)") as delimiters that stop the garble-stripping. This commit also fixes _has_non_dict_trailing, which incorrectly flagged "Theme 1" as having non-dictionary trailing text.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1182,6 +1182,10 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
|
|||||||
if wj in ('–', '—', '-', '/', '|', ',', ';'):
|
if wj in ('–', '—', '-', '/', '|', ',', ';'):
|
||||||
kept.extend(words[j:])
|
kept.extend(words[j:])
|
||||||
break
|
break
|
||||||
|
# Pure digits or numbering (e.g. "1", "2.", "3)") — keep
|
||||||
|
if re.match(r'^[\d.)\-]+$', wj):
|
||||||
|
kept.extend(words[j:])
|
||||||
|
break
|
||||||
# Starts with uppercase — likely German or proper noun
|
# Starts with uppercase — likely German or proper noun
|
||||||
clean_j = re.sub(r'[^a-zA-Z]', '', wj)
|
clean_j = re.sub(r'[^a-zA-Z]', '', wj)
|
||||||
if clean_j and clean_j[0].isupper():
|
if clean_j and clean_j[0].isupper():
|
||||||
@@ -1243,6 +1247,9 @@ def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
|
|||||||
wj = words[j]
|
wj = words[j]
|
||||||
if wj in ('–', '—', '-', '/', '|', ',', ';'):
|
if wj in ('–', '—', '-', '/', '|', ',', ';'):
|
||||||
return False
|
return False
|
||||||
|
# Pure digits or numbering (e.g. "1", "2.", "3)") — not garbled IPA
|
||||||
|
if re.match(r'^[\d.)\-]+$', wj):
|
||||||
|
return False
|
||||||
clean_j = re.sub(r'[^a-zA-Z]', '', wj)
|
clean_j = re.sub(r'[^a-zA-Z]', '', wj)
|
||||||
if clean_j and clean_j[0].isupper():
|
if clean_j and clean_j[0].isupper():
|
||||||
return False
|
return False
|
||||||
|
|||||||
Reference in New Issue
Block a user