debug: add phonetic bracket replacement logging
This commit is contained in:
@@ -4273,14 +4273,22 @@ def _fix_phonetic_brackets(
|
|||||||
if not IPA_AVAILABLE:
|
if not IPA_AVAILABLE:
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
replaced_count = 0
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
for field in ('english', 'german', 'example'):
|
for field in ('english', 'german', 'example'):
|
||||||
text = entry.get(field, '') or ''
|
text = entry.get(field, '') or ''
|
||||||
# Check for any bracket type — Tesseract garbles [ into { or (
|
# Check for any bracket type — Tesseract garbles [ into { or (
|
||||||
if not any(ch in text for ch in '[{('):
|
if not any(ch in text for ch in '[{('):
|
||||||
continue
|
continue
|
||||||
entry[field] = _replace_phonetics_in_text(text, pronunciation)
|
new_text = _replace_phonetics_in_text(text, pronunciation)
|
||||||
|
if new_text != text:
|
||||||
|
logger.info(f"_fix_phonetic_brackets: {field} '{text}' → '{new_text}'")
|
||||||
|
replaced_count += 1
|
||||||
|
else:
|
||||||
|
logger.info(f"_fix_phonetic_brackets: no change for {field} '{text}'")
|
||||||
|
entry[field] = new_text
|
||||||
|
|
||||||
|
logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries")
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|
||||||
@@ -4325,21 +4333,26 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
|
|||||||
def replacer(match):
|
def replacer(match):
|
||||||
word = match.group(1)
|
word = match.group(1)
|
||||||
bracket_content = match.group(2).strip()
|
bracket_content = match.group(2).strip()
|
||||||
|
full_match = match.group(0)
|
||||||
|
|
||||||
# Skip if bracket content looks like regular text (multiple words)
|
# Skip if bracket content looks like regular text (multiple words)
|
||||||
if len(bracket_content.split()) > 3:
|
if len(bracket_content.split()) > 3:
|
||||||
return match.group(0)
|
logger.info(f" phonetic replacer: SKIP (too many words) '{full_match}'")
|
||||||
|
return full_match
|
||||||
|
|
||||||
# Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen,
|
# Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen,
|
||||||
# Tanz(veranstaltung). These are real German morphemes, not garbled IPA.
|
# Tanz(veranstaltung). These are real German morphemes, not garbled IPA.
|
||||||
if _is_meaningful_bracket_content(bracket_content):
|
if _is_meaningful_bracket_content(bracket_content):
|
||||||
return match.group(0)
|
logger.info(f" phonetic replacer: SKIP (meaningful content) '{full_match}'")
|
||||||
|
return full_match
|
||||||
|
|
||||||
# Look up in IPA dictionary
|
# Look up in IPA dictionary
|
||||||
ipa = _lookup_ipa(word, pronunciation)
|
ipa = _lookup_ipa(word, pronunciation)
|
||||||
if not ipa:
|
if not ipa:
|
||||||
return match.group(0) # Keep original
|
logger.info(f" phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'")
|
||||||
|
return full_match # Keep original
|
||||||
|
|
||||||
|
logger.info(f" phonetic replacer: REPLACE '{full_match}' → '{word} [{ipa}]'")
|
||||||
return f"{word} [{ipa}]"
|
return f"{word} [{ipa}]"
|
||||||
|
|
||||||
return _PHONETIC_BRACKET_RE.sub(replacer, text)
|
return _PHONETIC_BRACKET_RE.sub(replacer, text)
|
||||||
|
|||||||
Reference in New Issue
Block a user