fix: broaden phonetic bracket regex to catch Tesseract-garbled IPA

Tesseract mangles IPA square brackets into curly braces or parentheses (e.g. China [ˈtʃaɪnə] → China {'tfatno]). The previous regex only matched [...], so it missed all garbled variants.
- Match any bracket type: [...], {...}, (...), including mixed pairs
- Add _is_meaningful_bracket_content() to preserve legitimate German prefixes and words like (zer)brechen and Tanz(veranstaltung)
- Trigger IPA replacement on any opening bracket character, not just [

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
fix: use Tesseract as default engine for cell-first OCR instead of RapidOCR
2026-03-04 22:53:50 +01:00 · 2026-03-04 22:30:34 +01:00 · 2026-03-04 22:21:14 +01:00
2 changed files with 82 additions and 16 deletions
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -4201,9 +4201,11 @@ def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, A
 # --- D. Phonetic Bracket IPA Replacement ---
-# Pattern: word [phonetic] or word (phonetic) — capture the word before brackets
+# Pattern: word followed by any bracket type containing phonetic content.
 # Tesseract often garbles IPA brackets: [ˈtʃaɪnə] → {'tfatno] or (cy) etc.
 # Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
 _PHONETIC_BRACKET_RE = re.compile(
-    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*\[([^\]]*)\]'
+    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
 )
@@ -4274,25 +4276,64 @@ def _fix_phonetic_brackets(
    for entry in entries:
        for field in ('english', 'german', 'example'):
            text = entry.get(field, '') or ''
-            if '[' not in text:
+            # Check for any bracket type — Tesseract garbles [ into { or (
            if not any(ch in text for ch in '[{('):
                continue
            entry[field] = _replace_phonetics_in_text(text, pronunciation)
    return entries
 # Short German prefixes that vocab tables often print in parentheses, e.g. (zer)brechen.
 # _is_meaningful_bracket_content() tests the lower-cased bracket content against this
 # set, so every entry must be lower-case. Longer parenthesised words such as
 # Tanz(veranstaltung) are not listed here; that function accepts them by length instead.
 _GERMAN_BRACKET_PREFIXES = frozenset({
    'ab', 'an', 'auf', 'aus', 'be', 'bei', 'dar', 'ein', 'emp', 'ent',
    'er', 'ge', 'her', 'hin', 'los', 'mit', 'nach', 'um', 'un', 'unter',
    'ver', 'vor', 'weg', 'zer', 'zu', 'zurück',
 })
 def _is_meaningful_bracket_content(content: str) -> bool:
    """Return True if bracket content is a meaningful word/prefix, not garbled IPA.
    Meaningful: (zer)brechen, (veranstaltung), (Haupt)stadt
    Garbled IPA: {'tfatno, (cy, 1u], 'daens
    """
    if not content:
        return False
    # Must be pure letters (no digits, punctuation, IPA symbols)
    if not re.match(r'^[a-zA-ZäöüÄÖÜßé]+$', content):
        return False
    # Known German prefix
    if content.lower() in _GERMAN_BRACKET_PREFIXES:
        return True
    # Long enough to be a real word (not 1-2 char garbled IPA like "cy")
    if len(content) >= 4:
        return True
    return False
 def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
-    """Replace [phonetic] after words with dictionary IPA."""
+    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.
    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    We match any bracket type and replace with dictionary IPA if found.
    Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.
    """
    if not IPA_AVAILABLE:
        return text
    def replacer(match):
        word = match.group(1)
-        ocr_phonetic = match.group(2)
+        bracket_content = match.group(2).strip()
-        # Skip if bracket content looks like regular text (has spaces + capitals)
+        # Skip if bracket content looks like regular text (multiple words)
-        if len(ocr_phonetic.split()) > 3:
+        if len(bracket_content.split()) > 3:
-            return match.group(0)  # Keep original
+            return match.group(0)
        # Skip if bracket content is a meaningful word/prefix — e.g. (zer)brechen,
        # Tanz(veranstaltung). These are real German morphemes, not garbled IPA.
        if _is_meaningful_bracket_content(bracket_content):
            return match.group(0)
        # Look up in IPA dictionary
        ipa = _lookup_ipa(word, pronunciation)
@@ -4886,13 +4927,15 @@ def build_cell_grid_v2(
    Drop-in replacement for build_cell_grid() — same signature & return type.
    No full-page word assignment; each cell is OCR'd from its own crop.
    """
-    # Resolve engine
+    # Resolve engine — default to Tesseract for cell-first OCR.
    # Tesseract excels at isolated text crops (binarized, upscaled).
    # RapidOCR is optimized for full-page scene-text and produces artifacts
    # on small cell crops (extra chars, missing punctuation, garbled IPA).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
-        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
+        engine_name = "tesseract"
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
@@ -5034,13 +5077,15 @@ def build_cell_grid_v2_streaming(
    Yields:
        (cell_dict, columns_meta, total_cells)
    """
-    # Resolve engine
+    # Resolve engine — default to Tesseract for cell-first OCR.
    # Tesseract excels at isolated text crops (binarized, upscaled).
    # RapidOCR is optimized for full-page scene-text and produces artifacts
    # on small cell crops (extra chars, missing punctuation, garbled IPA).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
-        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
+        engine_name = "tesseract"
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -1416,9 +1416,11 @@ async def _word_batch_stream_generator(
    # 2. Send preparing event (keepalive for proxy)
    yield f"data: {json.dumps({'type': 'preparing', 'message': 'Cell-First OCR laeuft parallel...'})}\n\n"
-    # 3. Run batch OCR in thread pool (CPU-bound, don't block event loop)
+    # 3. Run batch OCR in thread pool with periodic keepalive events.
    # The OCR takes 30-60s and proxy servers (Nginx) may drop idle SSE
    # connections after 30-60s. Send keepalive every 5s to prevent this.
    loop = asyncio.get_event_loop()
-    cells, columns_meta = await loop.run_in_executor(
+    ocr_future = loop.run_in_executor(
        None,
        lambda: build_cell_grid_v2(
            ocr_img, col_regions, row_geoms, img_w, img_h,
@@ -1426,6 +1428,25 @@ async def _word_batch_stream_generator(
        ),
    )
    # Send keepalive events every 5 seconds while OCR runs
    keepalive_count = 0
    while not ocr_future.done():
        try:
            cells, columns_meta = await asyncio.wait_for(
                asyncio.shield(ocr_future), timeout=5.0,
            )
            break  # OCR finished
        except asyncio.TimeoutError:
            keepalive_count += 1
            elapsed = int(time.time() - t0)
            yield f"data: {json.dumps({'type': 'keepalive', 'elapsed': elapsed, 'message': f'OCR laeuft... ({elapsed}s)'})}\n\n"
            if await request.is_disconnected():
                logger.info(f"SSE batch: client disconnected during OCR for {session_id}")
                ocr_future.cancel()
                return
    else:
        cells, columns_meta = ocr_future.result()
    if await request.is_disconnected():
        logger.info(f"SSE batch: client disconnected after OCR for {session_id}")
        return