debug: add phonetic bracket replacement logging

2026-03-04 23:01:01 +01:00
parent 6ad4b84584
commit 99573a46ef
1 changed file with 17 additions and 4 deletions
@@ -4273,14 +4273,22 @@ def _fix_phonetic_brackets(
    if not IPA_AVAILABLE:
        return entries
    replaced_count = 0
    for entry in entries:
        for field in ('english', 'german', 'example'):
            text = entry.get(field, '') or ''
            # Check for any bracket type — Tesseract garbles [ into { or (
            if not any(ch in text for ch in '[{('):
                continue
-            entry[field] = _replace_phonetics_in_text(text, pronunciation)
+            new_text = _replace_phonetics_in_text(text, pronunciation)
            if new_text != text:
                logger.info(f"_fix_phonetic_brackets: {field} '{text}' → '{new_text}'")
                replaced_count += 1
            else:
                logger.info(f"_fix_phonetic_brackets: no change for {field} '{text}'")
            entry[field] = new_text
    logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries")
    return entries
@@ -4325,21 +4333,26 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
    def replacer(match):
        word = match.group(1)
        bracket_content = match.group(2).strip()
        full_match = match.group(0)
        # Skip if bracket content looks like regular text (multiple words)
        if len(bracket_content.split()) > 3:
-            return match.group(0)
+            logger.info(f"  phonetic replacer: SKIP (too many words) '{full_match}'")
            return full_match
        # Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen,
        # Tanz(veranstaltung). These are real German morphemes, not garbled IPA.
        if _is_meaningful_bracket_content(bracket_content):
-            return match.group(0)
+            logger.info(f"  phonetic replacer: SKIP (meaningful content) '{full_match}'")
            return full_match
        # Look up in IPA dictionary
        ipa = _lookup_ipa(word, pronunciation)
        if not ipa:
-            return match.group(0)  # Keep original
+            logger.info(f"  phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'")
            return full_match  # Keep original
        logger.info(f"  phonetic replacer: REPLACE '{full_match}' → '{word} [{ipa}]'")
        return f"{word} [{ipa}]"
    return _PHONETIC_BRACKET_RE.sub(replacer, text)