Fix IPA correction for dictionary pages (WIP)

- Fix Step 5h: restrict slash-IPA conversion to the English headword column only. This prevents converting "der/die/das" to "der [dər]das" in German columns (confirmed working).
- Fix _text_has_garbled_ipa: detect embedded apostrophes in merged tokens like "Scotland'skotland", where OCR reads ˈ as '.
- Fix _insert_missing_ipa: detect a dictionary-word prefix in merged trailing tokens like "fictionsalans'fIkfn" → extract "fiction" with IPA.
- Move en_col_type to a wider scope so that Step 5h can access it.

Note: Fixes 1+2 are confirmed working in unit tests but do not yet take effect in the full build-grid pipeline — needs further debugging.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 23:54:14 +01:00
parent 4feec7c7b7
commit 9ea217bdfc
2 changed files with 25 additions and 1 deletion
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -858,6 +858,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
    # Single/two-column layouts are continuous text, not vocab tables.
    all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
+    en_col_type = None
    if total_cols >= 3:
        # Find the column that contains IPA brackets → English headwords.
        # Count cells with bracket patterns per col_type.  The column with
@@ -872,7 +873,6 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
            if ct.startswith("column_") and '[' in txt:
                col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1
        # Pick column with most bracket IPA patterns
-        en_col_type = None
        if col_bracket_count:
            en_col_type = max(col_bracket_count, key=col_bracket_count.get)
        else:
@@ -1105,6 +1105,10 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
    slash_ipa_fixed = 0
    for z in zones_data:
        for cell in z.get("cells", []):
+            # Only process English headword column — avoid converting
+            # German text like "der/die/das" to IPA.
+            if en_col_type and cell.get("col_type") != en_col_type:
+                continue
            text = cell.get("text", "")
            if "/" not in text:
                continue