diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 88075bb..88061cd 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -4292,32 +4292,58 @@ def _fix_phonetic_brackets( return entries -# German prefixes/words commonly in parentheses in vocab tables: (zer)brechen, Tanz(veranstaltung) -_GERMAN_BRACKET_PREFIXES = frozenset({ - 'ab', 'an', 'auf', 'aus', 'be', 'bei', 'dar', 'ein', 'emp', 'ent', - 'er', 'ge', 'her', 'hin', 'los', 'mit', 'nach', 'um', 'un', 'unter', - 'ver', 'vor', 'weg', 'zer', 'zu', 'zurück', +# Bracket content that is grammar info, not phonetic: cross (with), complain (about/of) +# Also German prefixes: (zer)brechen, Tanz(veranstaltung), Schild(chen) +# These should NEVER be replaced with IPA. +_GRAMMAR_BRACKET_WORDS = frozenset({ + # English prepositions/particles commonly in vocab tables + 'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by', + 'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through', + # German prepositions/particles + 'ab', 'an', 'auf', 'aus', 'bei', 'dar', 'ein', 'für', 'her', 'hin', + 'los', 'mit', 'nach', 'um', 'unter', 'von', 'vor', 'weg', 'zu', 'zurück', + # German verb prefixes (in parentheses before verb stems) + 'be', 'emp', 'ent', 'er', 'ge', 'un', 'ver', 'zer', + # Abbreviations + 'sth', 'sb', 'adj', 'adv', }) -def _is_meaningful_bracket_content(content: str) -> bool: - """Return True if bracket content is a meaningful word/prefix, not garbled IPA. +def _is_grammar_bracket_content(content: str) -> bool: + """Return True if bracket content is grammar info or a German morpheme. - Meaningful: (zer)brechen, (veranstaltung), (Haupt)stadt - Garbled IPA: {'tfatno, (cy, 1u], 'daens + Grammar info: cross (with), complain (about/of), agree (on/with) + German prefix: (zer)brechen, Tanz(veranstaltung), Schild(chen) + NOT grammar: [breik], {'tfatno], (cy), ['kju:kambo] + + Strategy: check each slash-separated token. If ALL tokens are known + grammar words or German affixes, it's grammar info. Otherwise it + might be garbled IPA. """ if not content: return False - # Must be pure letters (no digits, punctuation, IPA symbols) - if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', content): + + # Split on / for patterns like (about/of), (on/with) + tokens = [t.strip().lower() for t in content.split('/') if t.strip()] + if not tokens: return False - # Known German prefix - if content.lower() in _GERMAN_BRACKET_PREFIXES: - return True - # Long enough to be a real word (not 1-2 char garbled IPA like "cy") - if len(content) >= 4: - return True - return False + + for token in tokens: + # Each token must be pure letters + if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', token): + return False + # Check if it's a known grammar word + if token in _GRAMMAR_BRACKET_WORDS: + continue + # German suffix/word part (>= 4 chars, all letters) — e.g. "veranstaltung", "chen" + # But NOT things that look like garbled IPA transcriptions. + # Heuristic: if it's a common German suffix or a long word, keep it. + if len(token) >= 4: + continue + # Short unknown token — likely garbled IPA + return False + + return True def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str: @@ -4340,22 +4366,51 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str logger.info(f" phonetic replacer: SKIP (too many words) '{full_match}'") return full_match - # Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen, - # Tanz(veranstaltung). These are real German morphemes, not garbled IPA. - if _is_meaningful_bracket_content(bracket_content): - logger.info(f" phonetic replacer: SKIP (meaningful content) '{full_match}'") + # Look up IPA for the word before brackets + ipa = _lookup_ipa(word, pronunciation) + + if ipa: + # Word has IPA → bracket content is phonetic (garbled or correct). + # Exception: grammar particles like cross (with) — keep those. + if _is_grammar_bracket_content(bracket_content): + # Grammar info followed by garbled IPA? E.g. "cross (with) [kros]" + # Keep the grammar part, IPA will be handled as orphan bracket. + logger.info(f" phonetic replacer: SKIP (grammar info) '{full_match}'") + return full_match + logger.info(f" phonetic replacer: REPLACE '{full_match}' → '{word} [{ipa}]'") + return f"{word} [{ipa}]" + + # No IPA for this word — keep grammar info, strip garbled IPA + if _is_grammar_bracket_content(bracket_content): + logger.info(f" phonetic replacer: SKIP (grammar, no IPA) '{full_match}'") return full_match - # Look up in IPA dictionary - ipa = _lookup_ipa(word, pronunciation) - if not ipa: - logger.info(f" phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'") - return full_match # Keep original + logger.info(f" phonetic replacer: SKIP (no IPA for '{word}') '{full_match}'") + return full_match - logger.info(f" phonetic replacer: REPLACE '{full_match}' → '{word} [{ipa}]'") - return f"{word} [{ipa}]" + text = _PHONETIC_BRACKET_RE.sub(replacer, text) - return _PHONETIC_BRACKET_RE.sub(replacer, text) + # Second pass: strip remaining orphan brackets that are garbled IPA. + # These have no word before them (the main regex requires \b word \s* bracket). + # Examples: "[mais]", "{'mani setva]", trailing "(kros]" + # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]" + _IPA_CHARS = set('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔ̩̃ʊɐ') + + def _strip_orphan_bracket(m): + content = m.group(1).strip() + # Keep grammar info: (sich beschweren), (auf), (about/of) + if _is_grammar_bracket_content(content): + return m.group(0) + # Keep correct IPA (contains Unicode IPA characters) + if any(ch in _IPA_CHARS for ch in content): + return m.group(0) + logger.info(f" phonetic: stripping orphan bracket '{m.group(0)}'") + return '' + + text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text) + text = text.strip() + + return text def _assign_row_words_to_columns(