fix: only replace phonetics in english field, fix grammar detection

- Only process the 'english' field for IPA replacement. The German and example
  fields contain meaningful parenthetical content such as (gefrorenes Wasser)
  and (sich beschweren) that must never be replaced.
- Simplify _is_grammar_bracket_content: only known grammar particles (with,
  about/of, sth, etc.) are preserved. This removes the >= 4 chars heuristic,
  which incorrectly preserved garbled IPA like [breik], [maus].

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 23:19:03 +01:00
parent edbdac3203
commit e6dc3fcdd7
1 changed file with 19 additions and 34 deletions
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -4273,20 +4273,21 @@ def _fix_phonetic_brackets(
    if not IPA_AVAILABLE:
        return entries

+    # IPA phonetics only appear in the ENGLISH field of vocab tables.
+    # German and example fields contain meaningful parenthetical content:
+    #   german:  "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
+    #   example: "(sich beschweren)", "(brauchen)", "(jammern)"
+    # These must NEVER be processed as phonetic transcriptions.
    replaced_count = 0
    for entry in entries:
-        for field in ('english', 'german', 'example'):
-            text = entry.get(field, '') or ''
-            # Check for any bracket type — Tesseract garbles [ into { or (
-            if not any(ch in text for ch in '[{('):
-                continue
-            new_text = _replace_phonetics_in_text(text, pronunciation)
-            if new_text != text:
-                logger.info(f"_fix_phonetic_brackets: {field} '{text}' → '{new_text}'")
-                replaced_count += 1
-            else:
-                logger.info(f"_fix_phonetic_brackets: no change for {field} '{text}'")
-            entry[field] = new_text
+        text = entry.get('english', '') or ''
+        if not any(ch in text for ch in '[{('):
+            continue
+        new_text = _replace_phonetics_in_text(text, pronunciation)
+        if new_text != text:
+            logger.info(f"_fix_phonetic_brackets: english '{text}' → '{new_text}'")
+            replaced_count += 1
+        entry['english'] = new_text

    logger.info(f"_fix_phonetic_brackets: {replaced_count} replacements in {len(entries)} entries")
    return entries
@@ -4310,15 +4311,13 @@ _GRAMMAR_BRACKET_WORDS = frozenset({


 def _is_grammar_bracket_content(content: str) -> bool:
-    """Return True if bracket content is grammar info or a German morpheme.
+    """Return True if bracket content is grammar info in the ENGLISH field.

    Grammar info:  cross (with), complain (about/of), agree (on/with)
-    German prefix: (zer)brechen, Tanz(veranstaltung), Schild(chen)
-    NOT grammar:   [breik], {'tfatno], (cy), ['kju:kambo]
+    NOT grammar:   [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]

-    Strategy: check each slash-separated token. If ALL tokens are known
-    grammar words or German affixes, it's grammar info. Otherwise it
-    might be garbled IPA.
+    Since we only process the English field, we only need to recognize
+    English grammar particles. Everything else is (garbled) IPA.
    """
    if not content:
        return False
@@ -4328,22 +4327,8 @@ def _is_grammar_bracket_content(content: str) -> bool:
    if not tokens:
        return False

-    for token in tokens:
-        # Each token must be pure letters
-        if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', token):
-            return False
-        # Check if it's a known grammar word
-        if token in _GRAMMAR_BRACKET_WORDS:
-            continue
-        # German suffix/word part (>= 4 chars, all letters) — e.g. "veranstaltung", "chen"
-        # But NOT things that look like garbled IPA transcriptions.
-        # Heuristic: if it's a common German suffix or a long word, keep it.
-        if len(token) >= 4:
-            continue
-        # Short unknown token — likely garbled IPA
-        return False
-
-    return True
+    # ALL tokens must be known grammar words
+    return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)


 def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str: