Fix IPA insertion stripping digits after headwords ("Theme 1" → "Theme")
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 46s
CI / test-go-edu-search (push) Successful in 42s
CI / test-python-klausur (push) Failing after 2m46s
CI / test-python-agent-core (push) Successful in 34s
CI / test-nodejs-website (push) Successful in 30s

_insert_missing_ipa stripped "1" from "Theme 1" because it treated
the digit as garbled OCR phonetics. It now treats pure digits and numbering
patterns ("1", "2.", "3)") as delimiters that stop the garble-stripping.

Also fixes _has_non_dict_trailing, which incorrectly flagged "Theme 1"
as having non-dictionary trailing text.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-11 22:13:45 +02:00
parent 2e42167c73
commit cde13c9623

View File

@@ -1182,6 +1182,10 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
if wj in ('', '', '-', '/', '|', ',', ';'): if wj in ('', '', '-', '/', '|', ',', ';'):
kept.extend(words[j:]) kept.extend(words[j:])
break break
# Pure digits or numbering (e.g. "1", "2.", "3)") — keep
if re.match(r'^[\d.)\-]+$', wj):
kept.extend(words[j:])
break
# Starts with uppercase — likely German or proper noun # Starts with uppercase — likely German or proper noun
clean_j = re.sub(r'[^a-zA-Z]', '', wj) clean_j = re.sub(r'[^a-zA-Z]', '', wj)
if clean_j and clean_j[0].isupper(): if clean_j and clean_j[0].isupper():
@@ -1243,6 +1247,9 @@ def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
wj = words[j] wj = words[j]
if wj in ('', '', '-', '/', '|', ',', ';'): if wj in ('', '', '-', '/', '|', ',', ';'):
return False return False
# Pure digits or numbering (e.g. "1", "2.", "3)") — not garbled IPA
if re.match(r'^[\d.)\-]+$', wj):
return False
clean_j = re.sub(r'[^a-zA-Z]', '', wj) clean_j = re.sub(r'[^a-zA-Z]', '', wj)
if clean_j and clean_j[0].isupper(): if clean_j and clean_j[0].isupper():
return False return False