feat: IPA-Lautschrift in Cell-Texte einfuegen (fuer Overlay-Modus)

fix_cell_phonetics() ersetzt fehlerhafte IPA-Klammern UND fuegt fehlende Lautschrift fuer englische Woerter ein (z.B. badge, film, challenge, profit). Wird auf alle Zellen mit col_type column_en/column_text angewandt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 15:47:26 +01:00
parent 8a5f2aa188
commit 2f51ac617f
3 changed files with 231 additions and 2 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -43,6 +43,7 @@ from cv_vocab_pipeline import (
    _detect_sub_columns,
    _fix_character_confusion,
    _fix_phonetic_brackets,
+    fix_cell_phonetics,
    analyze_layout,
    analyze_layout_by_words,
    build_cell_grid,
@@ -2030,6 +2031,9 @@ async def detect_words(
    # Determine which engine was actually used
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

+    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
+    fix_cell_phonetics(cells, pronunciation=pronunciation)
+
    # Grid result (always generic)
    word_result = {
        "cells": cells,
@@ -2169,11 +2173,14 @@ async def _word_batch_stream_generator(
        logger.info(f"SSE batch: client disconnected after OCR for {session_id}")
        return

-    # 4. Send columns meta
+    # 4. Apply IPA phonetic fixes directly to cell texts (for overlay mode)
+    fix_cell_phonetics(cells, pronunciation=pronunciation)
+
+    # 5. Send columns meta
    if columns_meta:
        yield f"data: {json.dumps({'type': 'columns', 'columns_used': columns_meta})}\n\n"

-    # 5. Stream all cells
+    # 6. Stream all cells
    for idx, cell in enumerate(cells):
        cell_event = {
            "type": "cell",
@@ -2323,6 +2330,9 @@ async def _word_stream_generator(

    used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine

+    # Apply IPA phonetic fixes directly to cell texts (for overlay mode)
+    fix_cell_phonetics(all_cells, pronunciation=pronunciation)
+
    word_result = {
        "cells": all_cells,
        "grid_shape": {
@@ -3996,6 +4006,9 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
                n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
                used_engine = cells[0].get("ocr_engine", "tesseract") if cells else req.ocr_engine

+                # Apply IPA phonetic fixes directly to cell texts
+                fix_cell_phonetics(cells, pronunciation=req.pronunciation)
+
                word_result_data = {
                    "cells": cells,
                    "grid_shape": {