Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
476
klausur-service/backend/cv_ocr_ipa_lookup.py
Normal file
476
klausur-service/backend/cv_ocr_ipa_lookup.py
Normal file
@@ -0,0 +1,476 @@
|
||||
"""
|
||||
IPA lookup and phonetic bracket handling for OCR-extracted vocabulary.
|
||||
|
||||
Tesseract and other OCR engines frequently garble IPA phonetic transcriptions
|
||||
in vocabulary tables (e.g. [ˈdɑːns] → {'tfatno] or (cy)). This module
|
||||
provides functions to:
|
||||
|
||||
- Look up correct IPA pronunciations (British/American) for English words.
|
||||
- Detect and replace garbled phonetic brackets with dictionary IPA.
|
||||
- Insert missing IPA for headwords where OCR destroyed the brackets entirely.
|
||||
- Strip orphan brackets and post-bracket garbled fragments.
|
||||
- Handle IPA continuation cells (phonetics on a separate row from headword).
|
||||
|
||||
All IPA data comes from open-source dictionaries:
|
||||
- Britfone (MIT) for British English
|
||||
- eng_to_ipa / CMU (MIT) for American English
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from cv_vocab_types import (
|
||||
IPA_AVAILABLE,
|
||||
_britfone_dict,
|
||||
_ipa_convert_american,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# --- D. Phonetic Bracket IPA Replacement ---

# Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
# This intentionally matches mixed brackets (e.g. {content]) because
# Tesseract frequently misrecognizes bracket characters.
# Group 1: the headword before the bracket (ASCII letters plus German
# umlauts/ß). Group 2: the bracket interior (non-greedy, stops at the
# first closer of any type).
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)

# Unicode IPA characters — used to distinguish correct IPA (from dictionary
# lookup) from garbled OCR content when stripping orphan brackets.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')

# Minimum word confidence for full-page Tesseract results (0-100).
# Words below this threshold are OCR noise (scanner shadows, borders).
# NOTE(review): not referenced anywhere in this module's visible code —
# presumably imported by the OCR pipeline; confirm before removing.
_MIN_WORD_CONF = 30
|
||||
|
||||
|
||||
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
|
||||
"""Look up IPA for a word using the selected pronunciation dictionary.
|
||||
|
||||
Args:
|
||||
word: English word to look up.
|
||||
pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).
|
||||
|
||||
Returns:
|
||||
IPA string or None if not found.
|
||||
"""
|
||||
word_lower = word.lower().strip()
|
||||
if not word_lower:
|
||||
return None
|
||||
|
||||
if pronunciation == 'british' and _britfone_dict:
|
||||
ipa = _britfone_dict.get(word_lower)
|
||||
if ipa:
|
||||
return ipa
|
||||
# Fallback to American if not in Britfone
|
||||
if _ipa_convert_american:
|
||||
result = _ipa_convert_american(word_lower)
|
||||
if result and '*' not in result:
|
||||
return result
|
||||
return None
|
||||
|
||||
if pronunciation == 'american' and _ipa_convert_american:
|
||||
result = _ipa_convert_american(word_lower)
|
||||
if result and '*' not in result:
|
||||
return result
|
||||
# Fallback to Britfone if not in CMU
|
||||
if _britfone_dict:
|
||||
ipa = _britfone_dict.get(word_lower)
|
||||
if ipa:
|
||||
return ipa
|
||||
return None
|
||||
|
||||
# Try any available source
|
||||
if _britfone_dict:
|
||||
ipa = _britfone_dict.get(word_lower)
|
||||
if ipa:
|
||||
return ipa
|
||||
if _ipa_convert_american:
|
||||
result = _ipa_convert_american(word_lower)
|
||||
if result and '*' not in result:
|
||||
return result
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Swap OCR'd phonetic transcriptions for dictionary IPA, in place.

    Detects patterns like "dance [du:ns]" and replaces with correct IPA:
    - British: "dance [dˈɑːns]" (Britfone, MIT)
    - American: "dance [dæns]" (eng_to_ipa/CMU, MIT)

    Only the 'english' field of each entry is touched. German and example
    fields carry meaningful parenthetical content — "Eis (gefrorenes
    Wasser)", "(sich beschweren)" — and must never be treated as
    phonetics. Entries are mutated in place; the same list is returned.
    No-op when no IPA dictionary is available.
    """
    if not IPA_AVAILABLE:
        return entries

    n_replaced = 0
    for row in entries:
        original = row.get('english', '') or ''
        # Cheap pre-filter: without an opening bracket there is nothing
        # the replacement pass could match.
        if not any(opener in original for opener in '[{('):
            continue
        fixed = _replace_phonetics_in_text(original, pronunciation)
        if fixed == original:
            continue
        logger.debug(f"_fix_phonetic_brackets: '{original}' → '{fixed}'")
        n_replaced += 1
        row['english'] = fixed

    if n_replaced:
        logger.info(f"_fix_phonetic_brackets: {n_replaced} IPA replacements in {len(entries)} entries")
    return entries
|
||||
|
||||
|
||||
# Grammar particles that appear in brackets after English words:
|
||||
# cross (with), complain (about/of), agree (on/with), look (sth) up
|
||||
# These must NOT be replaced with IPA. Only used for the English field
|
||||
# (German/example fields are never processed for IPA replacement).
|
||||
_GRAMMAR_BRACKET_WORDS = frozenset({
|
||||
# English prepositions/particles commonly in vocab tables
|
||||
'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
|
||||
'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
|
||||
# English grammar abbreviations used in vocab tables
|
||||
'sth', 'sb', 'adj', 'adv',
|
||||
# Number/plural/grammar annotations
|
||||
'pl', 'sg', 'sing', 'no', 'also', 'auch',
|
||||
# Regional English markers
|
||||
'ae', 'be', 'ame', 'bre',
|
||||
})
|
||||
|
||||
|
||||
def _is_grammar_bracket_content(content: str) -> bool:
|
||||
"""Return True if bracket content is grammar info in the ENGLISH field.
|
||||
|
||||
Grammar info: cross (with), complain (about/of), agree (on/with)
|
||||
NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
|
||||
|
||||
Since we only process the English field, we only need to recognize
|
||||
English grammar particles. Everything else is (garbled) IPA.
|
||||
"""
|
||||
if not content:
|
||||
return False
|
||||
|
||||
# Split on / and spaces for patterns like (about/of), (no pl)
|
||||
tokens = re.split(r'[/\s]+', content.strip().lower())
|
||||
tokens = [t for t in tokens if t]
|
||||
if not tokens:
|
||||
return False
|
||||
|
||||
# ALL tokens must be known grammar words
|
||||
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
|
||||
|
||||
|
||||
def _replace_phonetics_in_text(
    text: str,
    pronunciation: str = 'british',
    strip_orphans: bool = True,
) -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    We match any bracket type and replace with dictionary IPA if found.
    Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.

    Two passes over *text*:
    1. ``_PHONETIC_BRACKET_RE``: every "word [content]" pair. If the word
       has a dictionary IPA entry, the bracket content is replaced by that
       IPA — unless the content is grammar info like "cross (with)".
    2. Optionally, remaining brackets ("orphans") are stripped when they
       look like garbled IPA rather than real parenthetical text.

    Args:
        text: Cell text from an OCR'd vocabulary table.
        pronunciation: 'british' (Britfone) or 'american' (eng_to_ipa/CMU),
            forwarded to ``_lookup_ipa``.
        strip_orphans: If True, strip orphan brackets that look like garbled IPA.
            Set to False for column_text where brackets may be German content.

    Returns:
        The processed text, stripped of surrounding whitespace; the input
        unchanged when no IPA dictionary is available.
    """
    if not IPA_AVAILABLE:
        return text

    def replacer(match):
        # Groups from _PHONETIC_BRACKET_RE: (1) headword, (2) bracket interior.
        word = match.group(1)
        bracket_content = match.group(2).strip()
        full_match = match.group(0)

        # Skip if bracket content looks like regular text (multiple words)
        if len(bracket_content.split()) > 3:
            return full_match

        # Look up IPA for the word before brackets
        ipa = _lookup_ipa(word, pronunciation)

        if ipa:
            # Word has IPA → bracket content is phonetic (garbled or correct).
            # Exception: grammar particles like cross (with) — keep those.
            if _is_grammar_bracket_content(bracket_content):
                return full_match
            logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'")
            return f"{word} [{ipa}]"

        # No IPA for this word — keep as-is
        return full_match

    text = _PHONETIC_BRACKET_RE.sub(replacer, text)

    if strip_orphans:
        # Second pass: strip remaining orphan brackets that are garbled IPA.
        # These have no word before them (the main regex requires \b word \s*
        # bracket); brackets kept by pass 1 because the preceding word had
        # no dictionary IPA are re-examined here as well.
        # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
        # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
        def _strip_orphan_bracket(m):
            content = m.group(1).strip()
            # Keep grammar info: (sich beschweren), (about/of)
            if _is_grammar_bracket_content(content):
                return m.group(0)
            # Keep correct IPA (contains Unicode IPA characters)
            if any(ch in _IPA_CHARS for ch in content):
                return m.group(0)
            # Keep real-word parentheticals like (probieren), (Profit), (Geld).
            # Garbled IPA fragments are short nonsense like (kros), (cy), (mais)
            # — they never contain a real word ≥4 letters with proper casing.
            # Letter classes cover German umlauts/ß and common French accents.
            content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
            if len(content_alpha) >= 4:
                return m.group(0)
            logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
            return ''

        text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)

    text = text.strip()

    return text
|
||||
|
||||
|
||||
def _text_has_garbled_ipa(text: str) -> bool:
|
||||
"""Check if text contains garbled IPA-like fragments from OCR.
|
||||
|
||||
Returns True if there is evidence of OCR-mangled phonetic
|
||||
transcription, e.g. stress marks, length marks, or IPA special chars.
|
||||
This is used to decide whether ``_insert_missing_ipa`` should run:
|
||||
it must only insert IPA to *replace* garbled phonetics that are already
|
||||
in the text — never to ADD phonetics where none existed on the page.
|
||||
"""
|
||||
# Bracketed text that doesn't contain valid IPA symbols is garbled OCR
|
||||
# of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
|
||||
stripped = text.strip()
|
||||
if stripped.startswith('[') and stripped.endswith(']'):
|
||||
inner = stripped[1:-1]
|
||||
# Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
|
||||
if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
|
||||
# Not a valid dictionary-style bracket like "(no pl)" — those
|
||||
# use parentheses, not square brackets. Square brackets with
|
||||
# no IPA chars are garbled phonetics.
|
||||
return True
|
||||
|
||||
for w in text.strip().split():
|
||||
# Skip delimiters and very short tokens
|
||||
if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'):
|
||||
continue
|
||||
# Starts with stress mark (OCR read IPA stress ' as apostrophe)
|
||||
if w.startswith("'") and len(w) > 1 and not w[1:].istitle():
|
||||
return True
|
||||
if w.startswith("\u02c8") or w.startswith("\u02cc"): # ˈ ˌ
|
||||
return True
|
||||
# Contains IPA length mark ':' in a short non-word fragment
|
||||
if ':' in w and len(w) < 12:
|
||||
# But not things like "3:00" (time) or common words
|
||||
stripped = re.sub(r'[^a-zA-Z:]', '', w)
|
||||
if ':' in stripped and not stripped.replace(':', '').isalpha():
|
||||
continue
|
||||
return True
|
||||
# Contains IPA special characters
|
||||
if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
|
||||
return True
|
||||
# Embedded apostrophe suggesting merged garbled IPA with stress mark.
|
||||
# E.g. "Scotland'skotland" — OCR reads ˈ as '.
|
||||
# Guard: apostrophe must be after ≥3 chars and before ≥3 lowercase
|
||||
# chars to avoid contractions (don't, won't, o'clock).
|
||||
if "'" in w and not w.startswith("'"):
|
||||
apos_idx = w.index("'")
|
||||
after = w[apos_idx + 1:]
|
||||
if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _decompose_compound(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Concatenate dictionary IPA for the two halves of a compound word.

    E.g. "schoolbag" → "school" + "bag" → both IPA strings joined.
    Candidate splits keep at least 3 characters on each side, and the
    split with the longest first part wins. Returns None unless BOTH
    halves are found in the dictionary, or when IPA support is missing.
    """
    if not IPA_AVAILABLE:
        return None

    normalized = word.lower().strip()
    if len(normalized) < 6:
        # Too short to be a compound of two ≥3-char parts.
        return None

    # Walk split points right-to-left, so the first full hit already has
    # the longest possible first part — the same winner an exhaustive
    # scan with max-tracking would pick.
    for cut in range(len(normalized) - 3, 2, -1):
        head_ipa = _lookup_ipa(normalized[:cut], pronunciation)
        if not head_ipa:
            continue
        tail_ipa = _lookup_ipa(normalized[cut:], pronunciation)
        if tail_ipa:
            return head_ipa + tail_ipa

    return None
|
||||
|
||||
|
||||
def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA pronunciation for English words that have no brackets at all.

    OCR sometimes garbles the phonetic transcription into plain-text fragments
    (e.g. "scare skea" where "skea" is garbled /skɛə/). This scans the text
    for the headword, inserts correct [IPA], and strips the garbled fragments.

    Only inserts for words that:
    - are standalone (not already followed by a bracket),
    - have an IPA entry in the dictionary,
    - appear in short cell-like fragments (≤6 whitespace tokens).

    Args:
        text: Cell text from an OCR'd vocabulary table.
        pronunciation: 'british' (Britfone) or 'american' (eng_to_ipa/CMU),
            forwarded to ``_lookup_ipa``.

    Returns:
        The text with "[IPA]" inserted after the first recognized headword
        and trailing garbled phonetics removed; the input unchanged when
        IPA support is unavailable, the text already contains brackets,
        is too long, or no headword could be resolved.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    # Skip if already has brackets (IPA replacement handles those)
    if any(ch in text for ch in '[{('):
        return text

    # Only process short text fragments (typical vocab cells).
    # Long sentences / paragraphs should not get IPA insertions.
    words = text.strip().split()
    if len(words) > 6:
        return text

    # IPA marker characters used by Fallback 1 to locate the point where
    # OCR merged a headword with its transcription. Hoisted out of the
    # loop (it was rebuilt every iteration although loop-invariant).
    ipa_split_chars = frozenset(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')

    # Try to insert IPA for the first suitable word.
    # Typical patterns: "challenge", "profit", "film", "badge"
    for i, w in enumerate(words):
        # Clean punctuation for lookup
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if not clean or len(clean) < 2:
            continue
        # Skip German/grammar words
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        ipa = _lookup_ipa(clean, pronunciation)
        # Fallback 0a: try without hyphens (e.g. "second-hand" → "secondhand")
        if not ipa and '-' in clean:
            ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
        # Fallback 0b: compound word decomposition
        # E.g. "schoolbag" → "school"+"bag" → concatenated IPA
        if not ipa:
            ipa = _decompose_compound(clean, pronunciation)
        # Fallback 1: IPA-marker split for merged tokens where OCR
        # joined headword with its IPA (e.g. "schoolbagsku:lbæg").
        # Find the first IPA marker character (:, æ, ɪ, etc.), walk
        # backwards ≤3 chars for the onset consonant cluster, and
        # split into headword + OCR IPA.
        if not ipa:
            first_marker = next(
                (p for p, ch in enumerate(w) if ch in ipa_split_chars), -1,
            )
            if first_marker >= 3:
                split = first_marker
                while (split > 0
                       and split > first_marker - 3
                       and w[split - 1].isalpha()
                       and w[split - 1].islower()):
                    split -= 1
                if split >= 2:
                    headword = w[:split]
                    ocr_ipa = w[split:]
                    hw_ipa = _lookup_ipa(headword, pronunciation)
                    if not hw_ipa:
                        # Try compound decomposition for the headword part
                        hw_ipa = _decompose_compound(headword, pronunciation)
                    if hw_ipa:
                        words[i] = f"{headword} [{hw_ipa}]"
                    else:
                        # Word not in dictionary — keep the OCR'd IPA
                        words[i] = f"{headword} [{ocr_ipa}]"
                    # Everything after the merged token is garbled residue.
                    words = words[:i + 1]
                    ipa = True  # signal that we handled it
                    break
        # Fallback 2: prefix matching for merged tokens WITHOUT IPA
        # markers (e.g. "Scotland'skotland"). Find longest dictionary
        # prefix using only alpha chars to avoid punctuation matches.
        if not ipa:
            alpha = re.sub(r'[^a-zA-Z]', '', clean)
            if len(alpha) > 5:  # need at least 6 chars for meaningful split
                for end in range(len(alpha), 3, -1):  # min prefix 4 chars
                    prefix = alpha[:end]
                    test_ipa = _lookup_ipa(prefix, pronunciation)
                    if test_ipa:
                        ipa = test_ipa
                        w = prefix
                        words[i] = prefix
                        break
        if ipa:
            words[i] = f"{w} [{ipa}]"
            # Strip garbled OCR phonetics after the IPA bracket.
            # On scanned vocab pages, printed IPA is read as garbled
            # text (e.g. "scare skea" where "skea" is garbled /skɛə/).
            # After inserting correct IPA, remove remaining words that
            # aren't real English words, delimiters, or German text.
            kept = words[:i + 1]
            for j in range(i + 1, len(words)):
                wj = words[j]
                # Delimiter — keep this and everything after
                if wj in ('–', '—', '-', '/', '|', ',', ';'):
                    kept.extend(words[j:])
                    break
                # Pure digits or numbering (e.g. "1", "2.", "3)") — keep
                if re.match(r'^[\d.)\-]+$', wj):
                    kept.extend(words[j:])
                    break
                # Starts with uppercase — likely German or proper noun
                clean_j = re.sub(r'[^a-zA-Z]', '', wj)
                if clean_j and clean_j[0].isupper():
                    kept.extend(words[j:])
                    break
                # Known English word (≥2 chars) — keep it and the rest
                if clean_j and len(clean_j) >= 2:
                    if _lookup_ipa(clean_j, pronunciation):
                        kept.extend(words[j:])
                        break
                # Merged token: dictionary word + garbled IPA stuck together.
                # E.g. "fictionsalans'fIkfn" starts with "fiction".
                # Extract the dictionary prefix (≥4 chars) and add it with
                # IPA, but only if enough chars remain after the prefix (≥3)
                # to look like garbled IPA, not just a plural 's'.
                if clean_j and len(clean_j) >= 7:
                    for pend in range(min(len(clean_j) - 3, 15), 3, -1):
                        prefix_j = clean_j[:pend]
                        prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
                        if prefix_ipa:
                            kept.append(f"{prefix_j} [{prefix_ipa}]")
                            break
                    break  # rest of this token is garbled
                # Otherwise — likely garbled phonetics, skip this token
            words = kept
            break

    return ' '.join(words)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user