debug: add phonetic bracket replacement logging

This commit is contained in:
Benjamin Admin
2026-03-04 23:01:01 +01:00
parent 6ad4b84584
commit 99573a46ef

View File

@@ -4273,14 +4273,22 @@ def _fix_phonetic_brackets(
if not IPA_AVAILABLE: if not IPA_AVAILABLE:
return entries return entries
replaced_count = 0
for entry in entries: for entry in entries:
for field in ('english', 'german', 'example'): for field in ('english', 'german', 'example'):
text = entry.get(field, '') or '' text = entry.get(field, '') or ''
# Check for any bracket type — Tesseract garbles [ into { or ( # Check for any bracket type — Tesseract garbles [ into { or (
if not any(ch in text for ch in '[{('): if not any(ch in text for ch in '[{('):
continue continue
entry[field] = _replace_phonetics_in_text(text, pronunciation) new_text = _replace_phonetics_in_text(text, pronunciation)
if new_text != text:
logger.info(f"_fix_phonetic_brackets: {field} '{text}' → '{new_text}'")
replaced_count += 1
else:
logger.info(f"_fix_phonetic_brackets: no change for {field} '{text}'")
entry[field] = new_text
logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries")
return entries return entries
@@ -4325,21 +4333,26 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
def replacer(match): def replacer(match):
word = match.group(1) word = match.group(1)
bracket_content = match.group(2).strip() bracket_content = match.group(2).strip()
full_match = match.group(0)
# Skip if bracket content looks like regular text (multiple words) # Skip if bracket content looks like regular text (multiple words)
if len(bracket_content.split()) > 3: if len(bracket_content.split()) > 3:
return match.group(0) logger.info(f" phonetic replacer: SKIP (too many words) '{full_match}'")
return full_match
# Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen, # Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen,
# Tanz(veranstaltung). These are real German morphemes, not garbled IPA. # Tanz(veranstaltung). These are real German morphemes, not garbled IPA.
if _is_meaningful_bracket_content(bracket_content): if _is_meaningful_bracket_content(bracket_content):
return match.group(0) logger.info(f" phonetic replacer: SKIP (meaningful content) '{full_match}'")
return full_match
# Look up in IPA dictionary # Look up in IPA dictionary
ipa = _lookup_ipa(word, pronunciation) ipa = _lookup_ipa(word, pronunciation)
if not ipa: if not ipa:
return match.group(0) # Keep original logger.info(f" phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'")
return full_match # Keep original
logger.info(f" phonetic replacer: REPLACE '{full_match}' → '{word} [{ipa}]'")
return f"{word} [{ipa}]" return f"{word} [{ipa}]"
return _PHONETIC_BRACKET_RE.sub(replacer, text) return _PHONETIC_BRACKET_RE.sub(replacer, text)