Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
189
klausur-service/backend/cv_ocr_cell_phonetics.py
Normal file
189
klausur-service/backend/cv_ocr_cell_phonetics.py
Normal file
@@ -0,0 +1,189 @@
|
||||
"""Cell-level IPA phonetic fixes for overlay mode.
|
||||
|
||||
In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
|
||||
(entry['english']). But the overlay reads cell['text'] directly, so
|
||||
phonetic fixes must be applied to cells too.
|
||||
|
||||
Split from cv_ocr_engines.py — contains fix_cell_phonetics() and helpers.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from cv_vocab_types import IPA_AVAILABLE
|
||||
|
||||
from cv_ocr_ipa_lookup import (
|
||||
_insert_missing_ipa,
|
||||
_replace_phonetics_in_text,
|
||||
_text_has_garbled_ipa,
|
||||
)
|
||||
from cv_ocr_ipa_repair import (
|
||||
_has_non_dict_trailing,
|
||||
_insert_headword_ipa,
|
||||
_strip_post_bracket_garbled,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def fix_cell_phonetics(
    cells: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Apply IPA phonetic fixes directly to cell texts (overlay mode).

    The regular pipeline runs _fix_phonetic_brackets on vocab entries
    (entry['english']), but the overlay consumes cell['text'] as-is, so
    the same repairs have to happen at the cell level.

    Treatment differs by column type:
      * column_en   -- full repair: replace garbled IPA, strip orphan
        brackets, insert missing IPA. Safe because these cells contain
        only English headwords.
      * column_text -- conservative repair: replace garbled IPA only.
        Orphan brackets are kept (they may be German content such as
        "(probieren)") and no IPA is inserted blindly, since extra
        tokens would shift the overlay positioning.
    """
    if not IPA_AVAILABLE:
        return cells

    fix_count = 0

    for cell in cells:
        kind = cell.get('col_type', '')
        if kind not in ('column_en', 'column_text'):
            continue

        original = cell.get('text', '') or ''
        if not original.strip():
            continue

        if kind == 'column_en':
            # Aggressive path: garbled-IPA replacement plus orphan-bracket
            # stripping is safe for pure English headword cells.
            repaired = _replace_phonetics_in_text(original, pronunciation, strip_orphans=True)
            if repaired == original and (
                _text_has_garbled_ipa(original)
                or _has_non_dict_trailing(original, pronunciation)
            ):
                # Nothing was replaced, yet the text still looks like it
                # hides garbled phonetics (possibly plain ASCII) — insert.
                repaired = _insert_missing_ipa(original, pronunciation)
            if ']' in repaired:
                # Drop garbled leftovers trailing a proper [IPA] bracket
                # (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
                repaired = _strip_post_bracket_garbled(repaired, pronunciation)
        else:
            # column_text: only swap garbled IPA in place, keep brackets.
            repaired = _replace_phonetics_in_text(original, pronunciation, strip_orphans=False)
            if repaired == original:
                # Insert a headword IPA only when word_boxes show a large
                # horizontal gap — evidence that Tesseract dropped an IPA
                # bracket that physically existed on the page. Without gap
                # evidence, the original page simply had no IPA.
                if _has_ipa_gap(original, cell.get('word_boxes', [])):
                    candidate = _insert_headword_ipa(original, pronunciation)
                    if candidate != original:
                        repaired = candidate
                        _sync_word_boxes_after_ipa_insert(cell, original, repaired)

        if repaired != original:
            logger.debug(f"fix_cell_phonetics: '{original}' → '{repaired}'")
            cell['text'] = repaired
            fix_count += 1

    if fix_count:
        logger.info(f"fix_cell_phonetics: {fix_count} IPA fixes in {len(cells)} cells")
    return cells
|
||||
|
||||
|
||||
def _has_ipa_gap(text: str, word_boxes: List[Dict]) -> bool:
|
||||
"""Check if word_boxes show a gap where IPA brackets should be.
|
||||
|
||||
On a typical vocab page, the layout is:
|
||||
headword [ipa] German translation
|
||||
|
||||
If Tesseract missed the IPA bracket, the gap between the headword
|
||||
and the next word (German translation) is unusually large (>80px)
|
||||
because the IPA occupied physical space on the page.
|
||||
|
||||
If no IPA was on the page (e.g. "be good at sth."), the words are
|
||||
close together (<30px).
|
||||
"""
|
||||
if not word_boxes or len(word_boxes) < 2:
|
||||
return False
|
||||
|
||||
tokens = text.split()
|
||||
if not tokens:
|
||||
return False
|
||||
|
||||
# Find the headword index: skip numeric prefixes like "».55", "0.56"
|
||||
hw_box_idx = 0
|
||||
for i, wb in enumerate(word_boxes):
|
||||
wt = wb.get('text', '')
|
||||
clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', wt)
|
||||
if len(clean) >= 2:
|
||||
hw_box_idx = i
|
||||
break
|
||||
|
||||
if hw_box_idx >= len(word_boxes) - 1:
|
||||
return False
|
||||
|
||||
# Check gap between headword and the next word_box
|
||||
hw = word_boxes[hw_box_idx]
|
||||
next_wb = word_boxes[hw_box_idx + 1]
|
||||
gap = next_wb['left'] - (hw['left'] + hw['width'])
|
||||
|
||||
return gap > 80
|
||||
|
||||
|
||||
def _sync_word_boxes_after_ipa_insert(
|
||||
cell: Dict[str, Any],
|
||||
old_text: str,
|
||||
new_text: str,
|
||||
) -> None:
|
||||
"""Insert a synthetic word_box for an IPA token added by IPA insertion.
|
||||
|
||||
E.g. "challenge ..." → "challenge [tʃælɪndʒ] ..."
|
||||
Adds a new word_box right after the headword's box so the 1:1
|
||||
token-to-box mapping in the frontend overlay stays consistent.
|
||||
"""
|
||||
word_boxes = cell.get('word_boxes')
|
||||
if not word_boxes:
|
||||
return
|
||||
|
||||
old_tokens = old_text.split()
|
||||
new_tokens = new_text.split()
|
||||
|
||||
if len(new_tokens) != len(old_tokens) + 1:
|
||||
return # unexpected change, skip
|
||||
|
||||
# Find the inserted token by walking both lists in parallel.
|
||||
# One token in new_tokens won't match — that's the inserted IPA.
|
||||
insert_idx = -1
|
||||
j = 0 # index into old_tokens
|
||||
for i in range(len(new_tokens)):
|
||||
if j < len(old_tokens) and new_tokens[i] == old_tokens[j]:
|
||||
j += 1
|
||||
else:
|
||||
insert_idx = i
|
||||
break
|
||||
|
||||
if insert_idx < 0 or insert_idx >= len(new_tokens):
|
||||
return
|
||||
|
||||
ipa_token = new_tokens[insert_idx]
|
||||
|
||||
# The headword is at insert_idx - 1 in old_tokens (and word_boxes)
|
||||
ref_idx = insert_idx - 1
|
||||
if ref_idx < 0 or ref_idx >= len(word_boxes):
|
||||
return
|
||||
|
||||
ref_box = word_boxes[ref_idx]
|
||||
ipa_box = {
|
||||
'text': ipa_token,
|
||||
'left': ref_box['left'] + ref_box['width'] + 2,
|
||||
'top': ref_box['top'],
|
||||
'width': ref_box['width'],
|
||||
'height': ref_box['height'],
|
||||
'conf': ref_box.get('conf', 90),
|
||||
}
|
||||
word_boxes.insert(insert_idx, ipa_box)
|
||||
Reference in New Issue
Block a user