Fix: Remove broken getKlausurApiUrl and clean up empty lines

A sed replacement left orphaned hostname references in the story page and empty lines in the getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions
--- a/klausur-service/backend/cv_ocr_ipa_repair.py
+++ b/klausur-service/backend/cv_ocr_ipa_repair.py
@@ -0,0 +1,287 @@
+"""
+Advanced IPA repair for OCR-extracted vocabulary.
+
+Functions that detect and fix garbled IPA fragments trailing after
+headwords or in continuation cells. Split from cv_ocr_ipa_lookup.py
+to stay within the 500 LOC budget.
+
+Contains:
+- _has_non_dict_trailing: detect non-dictionary trailing words
+- _strip_post_bracket_garbled: strip garbled IPA after [brackets]
+- fix_ipa_continuation_cell: replace garbled IPA in continuation rows
+- _insert_headword_ipa: insert IPA for first headword in mixed-lang lines
+
+Every function here relies on ``_lookup_ipa`` from cv_ocr_ipa_lookup for
+its dictionary checks.  All of them except _strip_post_bracket_garbled
+bail out early (returning False or their input text) when
+``IPA_AVAILABLE`` is falsy.
+"""
+
+import logging
+import re
+from typing import Any, Dict, List, Optional
+
+# Flag consulted by the functions below before any dictionary lookup.
+from cv_vocab_types import IPA_AVAILABLE
+# Dictionary lookup helper and the collection of grammar words that must
+# not be treated as headwords.
+from cv_ocr_ipa_lookup import (
+    _lookup_ipa,
+    _GRAMMAR_BRACKET_WORDS,
+)
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
+    """Report whether a headword is followed only by non-dictionary words.
+
+    Acts as an extra trigger for ``_insert_missing_ipa`` in the cases where
+    ``_text_has_garbled_ipa`` returns False because the garbled IPA happens
+    to look like plain ASCII (e.g. "skea" for /skɛə/).
+
+    Args:
+        text: Cell text; only texts of 2 to 6 whitespace-separated tokens
+            are considered.
+        pronunciation: Passed through to ``_lookup_ipa``.
+
+    Returns:
+        True when a dictionary headword is found and none of the tokens
+        after it is a delimiter, a number, a capitalised word or a
+        dictionary word.  False otherwise, and always False when
+        ``IPA_AVAILABLE`` is falsy.
+    """
+    if not IPA_AVAILABLE:
+        return False
+
+    tokens = text.strip().split()
+    if not 2 <= len(tokens) <= 6:
+        return False
+
+    # Locate the headword: the first token (ignoring grammar words and
+    # fragments shorter than two letters) that the dictionary knows.
+    headword_pos = None
+    for pos, token in enumerate(tokens):
+        letters = re.sub(r'[^a-zA-Z\'-]', '', token)
+        if len(letters) < 2 or letters.lower() in _GRAMMAR_BRACKET_WORDS:
+            continue
+        if _lookup_ipa(letters, pronunciation):
+            headword_pos = pos
+            break
+
+    # No headword at all, or nothing follows it.
+    if headword_pos is None or headword_pos >= len(tokens) - 1:
+        return False
+
+    # A single "legitimate" trailing token is enough to rule out garbled
+    # IPA; only if every trailing token fails all checks do we report True.
+    for trailing in tokens[headword_pos + 1:]:
+        if trailing in ('–', '—', '-', '/', '|', ',', ';'):
+            return False
+        # Pure digits or numbering such as "1", "2.", "3)".
+        if re.match(r'^[\d.)\-]+$', trailing):
+            return False
+        letters = re.sub(r'[^a-zA-Z]', '', trailing)
+        if not letters:
+            continue
+        # Capitalised word: treated as German, not as garbled IPA.
+        if letters[0].isupper():
+            return False
+        if len(letters) >= 2 and _lookup_ipa(letters, pronunciation):
+            return False
+    return True
+
+
+def _strip_post_bracket_garbled(
+    text: str, pronunciation: str = 'british',
+) -> str:
+    """Drop garbled IPA fragments that trail after the last ``]``.
+
+    Examples::
+
+        sea [sˈiː] si:                → sea [sˈiː]
+        seat [sˈiːt] si:t             → seat [sˈiːt]
+        seat [sˈiːt] belt si:t belt   → seat [sˈiːt] belt
+
+    Tokens after the bracket are scanned left to right:
+
+    * a delimiter or a capitalised word (likely German) ends the scan and
+      is kept together with everything after it;
+    * a token containing an IPA marker character (``:``, ``ə``, ...) is
+      dropped;
+    * a dictionary word is kept.  If IPA markers occur anywhere after it
+      (multi-word headword such as "seat belt" followed by a garbled
+      duplicate), the rest is dropped up to the next delimiter or
+      capitalised word; otherwise the whole rest is kept;
+    * any other token is dropped as likely garbled.
+
+    Args:
+        text: Cell text, possibly containing ``[IPA]`` brackets.
+        pronunciation: Passed through to ``_lookup_ipa``.
+
+    Returns:
+        ``text`` unchanged when it has no ``]`` or nothing follows the last
+        one; otherwise the part up to the last ``]`` plus the kept tokens,
+        joined by single spaces.
+    """
+    close_pos = text.rfind(']')
+    if close_pos < 0 or close_pos >= len(text) - 1:
+        return text
+
+    head = text[:close_pos + 1].rstrip()
+    tail_tokens = text[close_pos + 1:].split()
+    if not tail_tokens:
+        return text
+
+    delimiters = ('–', '—', '-', '/', '|', ',', ';')
+    marker_chars = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
+
+    def looks_garbled(token: str) -> bool:
+        # True when the token contains at least one IPA marker character.
+        return not marker_chars.isdisjoint(token)
+
+    def ends_garbled_run(token: str) -> bool:
+        # Delimiters and capitalised words mark the start of content that
+        # must be preserved verbatim.
+        if token in delimiters:
+            return True
+        return re.sub(r'[^a-zA-Z]', '', token)[:1].isupper()
+
+    survivors = []
+    for pos, token in enumerate(tail_tokens):
+        if token in delimiters:
+            survivors += tail_tokens[pos:]
+            break
+        if looks_garbled(token):
+            # Drop just this token and keep scanning the following ones.
+            continue
+        letters = re.sub(r'[^a-zA-Z]', '', token)
+        if letters[:1].isupper():
+            survivors += tail_tokens[pos:]
+            break
+        if len(letters) < 2 or not _lookup_ipa(letters, pronunciation):
+            # Unknown short word, likely garbled: drop it.
+            continue
+
+        # Known English word.
+        following = tail_tokens[pos + 1:]
+        if not any(looks_garbled(t) for t in following):
+            survivors += tail_tokens[pos:]
+            break
+
+        # Garbled duplication follows: keep the word itself, then resume
+        # keeping only from the next delimiter / capitalised word, if any.
+        survivors.append(token)
+        for rest_pos, rest_token in enumerate(following):
+            if ends_garbled_run(rest_token):
+                survivors += following[rest_pos:]
+                break
+        break
+
+    if not survivors:
+        return head
+    return head + ' ' + ' '.join(survivors)
+
+
+def fix_ipa_continuation_cell(
+    garbled_text: str,
+    headword_text: str,
+    pronunciation: str = 'british',
+) -> str:
+    """Replace garbled IPA in a continuation row with proper IPA.
+
+    Continuation rows appear below the headword and contain only the
+    printed phonetic transcription, which OCR garbles into fragments
+    like ``ska:f – ska:vz`` (should be ``[skˈɑːf] – [skˈɑːvz]``).
+
+    Args:
+        garbled_text: The OCR-garbled IPA text from the continuation row.
+        headword_text: The headword text from the previous row
+            (e.g. ``scarf – scarves``).
+        pronunciation: ``'british'`` or ``'american'``.
+
+    Returns:
+        One of:
+
+        * ``garbled_text`` unchanged, when ``IPA_AVAILABLE`` is falsy, an
+          argument is empty, or no headword word yields an IPA entry;
+        * the last inline ``[...]`` of ``headword_text``, when the
+          headword already ends with its IPA bracket;
+        * otherwise one ``[ipa ...]`` group per dash-separated headword
+          part, joined with ``' – '``.
+    """
+    if not IPA_AVAILABLE or not garbled_text or not headword_text:
+        return garbled_text
+
+    # If headword already has inline IPA like "beat [bˈiːt] , beat, beaten",
+    # only generate continuation IPA for words NOT already covered.
+    covered_words: set = set()
+    has_inline_ipa = bool(re.search(r'\[[^\]]*\]', headword_text))
+    if has_inline_ipa:
+        # Words before the first bracket already have their IPA shown
+        first_bracket = headword_text.index('[')
+        pre_bracket = headword_text[:first_bracket].strip()
+        for w in pre_bracket.split():
+            clean = re.sub(r'[^a-zA-Z\'-]', '', w).lower()
+            if clean and len(clean) >= 2:
+                covered_words.add(clean)
+
+        last_bracket_end = headword_text.rfind(']')
+        tail = headword_text[last_bracket_end + 1:].strip()
+
+        if not tail or not re.search(r'[a-zA-Z]{2,}', tail):
+            # Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]")
+            # — return the inline IPA directly (continuation duplicates it).
+            # The opening bracket is searched only *before* the closing
+            # one: a stray '[' in the tail would otherwise yield an empty
+            # slice and wipe the cell.  A match is guaranteed because
+            # has_inline_ipa found a '[' ... ']' pair.
+            last_bracket_start = headword_text.rfind('[', 0, last_bracket_end)
+            inline_ipa = headword_text[last_bracket_start:last_bracket_end + 1]
+            return inline_ipa
+
+        # Only the tail words need continuation IPA
+        headword_text = tail
+
+    # Strip existing IPA brackets and parenthetical grammar annotations
+    # like "(no pl)", "(sth)", "(sb)" from headword text
+    clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
+    clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
+    if not clean_hw:
+        return garbled_text
+
+    # Split headword by delimiters (– — -).  A plain hyphen only counts
+    # when surrounded by whitespace, so hyphenated words stay intact.
+    # "scarf – scarves" → ["scarf", "scarves"]
+    # "see - saw - seen" → ["see", "saw", "seen"]
+    parts = re.split(r'\s*[–—]\s*|\s+-\s+', clean_hw)
+    parts = [p.strip() for p in parts if p.strip()]
+
+    if not parts:
+        return garbled_text
+
+    # Look up IPA for each headword part.
+    # Skip articles (the, a, an) — they never get IPA in vocab books.
+    # Other function words like "down", "up" are kept because they are
+    # integral parts of phrasal verbs (e.g. "close down").
+    # Skip words that already have inline IPA in the headword row.
+    _ARTICLES = {'the', 'a', 'an'}
+    ipa_parts: List[str] = []
+    for part in parts:
+        # A part may be multi-word like "secondary school"
+        words = part.split()
+        word_ipas: List[str] = []
+        for w in words:
+            clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
+            if not clean_w or len(clean_w) < 2:
+                continue
+            if covered_words and clean_w.lower() in covered_words:
+                continue  # Already has IPA inline in the headword
+            if clean_w.lower() in _ARTICLES:
+                continue  # Articles never get IPA in vocab books
+            ipa = _lookup_ipa(clean_w, pronunciation)
+            if ipa:
+                word_ipas.append(ipa)
+        # Parts without any resolvable word contribute no bracket group.
+        if word_ipas:
+            ipa_parts.append('[' + ' '.join(word_ipas) + ']')
+
+    if not ipa_parts:
+        return garbled_text
+
+    # Join with delimiter
+    result = ' – '.join(ipa_parts)
+    logger.debug(
+        "fix_ipa_continuation: '%s' → '%s' (headwords: '%s')",
+        garbled_text, result, headword_text,
+    )
+    return result
+
+
+def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
+    """Insert IPA for the first English headword in a long mixed-language line.
+
+    Unlike _insert_missing_ipa (for short column_en cells), this handles
+    column_text lines of any length.  It only inserts IPA for the FIRST word
+    if that word:
+    - has no bracket following it already
+    - has an IPA entry in the dictionary
+    - is not a number/symbol prefix like "».55"
+
+    Only the first three tokens are examined.  Tokens with fewer than two
+    letters and grammar words are skipped; the scan stops at the first
+    remaining candidate whether or not it has an IPA entry.
+
+    Args:
+        text: The line to annotate.
+        pronunciation: Passed through to ``_lookup_ipa``.
+
+    Returns:
+        The text with ``[ipa]`` inserted after the headword (whitespace
+        collapsed to single spaces), or ``text`` unchanged.
+    """
+    if not IPA_AVAILABLE:
+        return text
+    if not text or not text.strip():
+        return text
+
+    words = text.strip().split()
+    if not words:
+        return text
+
+    # OCR may render the opening IPA bracket as '{' or '(' as well.
+    bracket_openers = ('[', '{', '(')
+
+    # Second token already opens a bracket: IPA is taken to be present.
+    if len(words) > 1 and words[1].startswith(bracket_openers):
+        return text
+
+    # Try the first few words (skip numeric prefixes like "».55", "0.56")
+    for i in range(min(3, len(words))):
+        w = words[i]
+        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
+        if not clean or len(clean) < 2:
+            continue
+        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
+            continue
+        # The candidate may sit behind a skipped prefix ("».55 sea [siː]");
+        # repeat the bracket check for the token right after it so that no
+        # duplicate IPA is inserted.
+        if i + 1 < len(words) and words[i + 1].startswith(bracket_openers):
+            return text
+        ipa = _lookup_ipa(clean, pronunciation)
+        if ipa:
+            words[i] = f"{w} [{ipa}]"
+            return ' '.join(words)
+        # Stop at first real word even if no IPA found
+        break
+
+    return text