diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 8adacc6..f0a7cb8 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -1030,6 +1030,15 @@ def _text_has_garbled_ipa(text: str) -> bool: # Contains IPA special characters if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'): return True + # Embedded apostrophe suggesting merged garbled IPA with stress mark. + # E.g. "Scotland'skotland" — OCR reads ˈ as '. + # Guard: apostrophe must be after ≥3 chars and before ≥3 lowercase + # chars to avoid contractions (don't, won't, o'clock). + if "'" in w and not w.startswith("'"): + apos_idx = w.index("'") + after = w[apos_idx + 1:] + if apos_idx >= 3 and len(after) >= 3 and after[0].islower(): + return True return False @@ -1183,6 +1192,17 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str: if _lookup_ipa(clean_j, pronunciation): kept.extend(words[j:]) break + # Merged token: dictionary word + garbled IPA stuck together. + # E.g. "fictionsalans'fIkfn" starts with "fiction". + # Extract the dictionary prefix and add it with IPA. + if clean_j and len(clean_j) >= 5: + for pend in range(min(len(clean_j), 15), 2, -1): + prefix_j = clean_j[:pend] + prefix_ipa = _lookup_ipa(prefix_j, pronunciation) + if prefix_ipa: + kept.append(f"{prefix_j} [{prefix_ipa}]") + break + break # rest of this token is garbled # Otherwise — likely garbled phonetics, skip words = kept break diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 1fa99d0..91011bd 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -858,6 +858,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # Single/two-column layouts are continuous text, not vocab tables. all_cells = [cell for z in zones_data for cell in z.get("cells", [])] total_cols = sum(len(z.get("columns", [])) for z in zones_data) + en_col_type = None if total_cols >= 3: # Find the column that contains IPA brackets → English headwords. # Count cells with bracket patterns per col_type. The column with @@ -872,7 +873,6 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if ct.startswith("column_") and '[' in txt: col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1 # Pick column with most bracket IPA patterns - en_col_type = None if col_bracket_count: en_col_type = max(col_bracket_count, key=col_bracket_count.get) else: @@ -1105,6 +1105,10 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: slash_ipa_fixed = 0 for z in zones_data: for cell in z.get("cells", []): + # Only process English headword column — avoid converting + # German text like "der/die/das" to IPA. + if en_col_type and cell.get("col_type") != en_col_type: + continue text = cell.get("text", "") if "/" not in text: continue