Fix IPA correction for dictionary pages (WIP)

- Fix Step 5h: restrict slash-IPA conversion to English headword column
  only — prevents converting "der/die/das" to "der [dər]das" in German
  columns (confirmed working)
- Fix _text_has_garbled_ipa: detect embedded apostrophes in merged
  tokens like "Scotland'skotland" where OCR reads ˈ as '
- Fix _insert_missing_ipa: detect dictionary word prefix in merged
  trailing tokens like "fictionsalans'fIkfn" → extract "fiction" with IPA
- Move en_col_type to wider scope for Step 5h access

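Note: Fixes 1+2 (presumably the _text_has_garbled_ipa and _insert_missing_ipa
changes, since the Step 5h fix above is already marked as confirmed working)
pass in unit tests but do not yet take effect in the full build-grid
pipeline — needs further debugging.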

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-24 23:54:14 +01:00
parent 4feec7c7b7
commit 9ea217bdfc
2 changed files with 25 additions and 1 deletions

View File

@@ -1030,6 +1030,15 @@ def _text_has_garbled_ipa(text: str) -> bool:
# Contains IPA special characters # Contains IPA special characters
if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'): if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
return True return True
# Embedded apostrophe suggesting merged garbled IPA with stress mark.
# E.g. "Scotland'skotland" — OCR reads ˈ as '.
# Guard: apostrophe must come after ≥3 chars and be followed by ≥3 chars,
# the first of which is lowercase, to avoid contractions (don't, won't, o'clock).
if "'" in w and not w.startswith("'"):
apos_idx = w.index("'")
after = w[apos_idx + 1:]
if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
return True
return False return False
@@ -1183,6 +1192,17 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
if _lookup_ipa(clean_j, pronunciation): if _lookup_ipa(clean_j, pronunciation):
kept.extend(words[j:]) kept.extend(words[j:])
break break
# Merged token: dictionary word + garbled IPA stuck together.
# E.g. "fictionsalans'fIkfn" starts with "fiction".
# Extract the dictionary prefix and add it with IPA.
if clean_j and len(clean_j) >= 5:
for pend in range(min(len(clean_j), 15), 2, -1):
prefix_j = clean_j[:pend]
prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
if prefix_ipa:
kept.append(f"{prefix_j} [{prefix_ipa}]")
break
break # rest of this token is garbled
# Otherwise — likely garbled phonetics, skip # Otherwise — likely garbled phonetics, skip
words = kept words = kept
break break

View File

@@ -858,6 +858,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
# Single/two-column layouts are continuous text, not vocab tables. # Single/two-column layouts are continuous text, not vocab tables.
all_cells = [cell for z in zones_data for cell in z.get("cells", [])] all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
total_cols = sum(len(z.get("columns", [])) for z in zones_data) total_cols = sum(len(z.get("columns", [])) for z in zones_data)
en_col_type = None
if total_cols >= 3: if total_cols >= 3:
# Find the column that contains IPA brackets → English headwords. # Find the column that contains IPA brackets → English headwords.
# Count cells with bracket patterns per col_type. The column with # Count cells with bracket patterns per col_type. The column with
@@ -872,7 +873,6 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
if ct.startswith("column_") and '[' in txt: if ct.startswith("column_") and '[' in txt:
col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1 col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1
# Pick column with most bracket IPA patterns # Pick column with most bracket IPA patterns
en_col_type = None
if col_bracket_count: if col_bracket_count:
en_col_type = max(col_bracket_count, key=col_bracket_count.get) en_col_type = max(col_bracket_count, key=col_bracket_count.get)
else: else:
@@ -1105,6 +1105,10 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
slash_ipa_fixed = 0 slash_ipa_fixed = 0
for z in zones_data: for z in zones_data:
for cell in z.get("cells", []): for cell in z.get("cells", []):
# Only process English headword column — avoid converting
# German text like "der/die/das" to IPA.
if en_col_type and cell.get("col_type") != en_col_type:
continue
text = cell.get("text", "") text = cell.get("text", "")
if "/" not in text: if "/" not in text:
continue continue