Only apply IPA correction to vocabulary tables (≥3 columns)
Single-column German text pages were getting IPA inserted for words
that happen to exist in the English dictionary ("die" → [dˈaɪ],
"Das" → [dɑs]). IPA correction now runs only when the grid has ≥3
columns, the minimum for a vocabulary table layout
(English | article | German); one- and two-column layouts are treated
as continuous text and left untouched.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1165,35 +1165,36 @@ async def build_grid(session_id: str):
|
||||
|
||||
# 5c. IPA phonetic correction — replace garbled OCR phonetics with
|
||||
# correct IPA from the dictionary (same as in the OCR pipeline).
|
||||
# The grid uses generic col_types (column_1, column_2, ...) but
|
||||
# fix_cell_phonetics expects column_en / column_text. Identify
|
||||
# the English headword column (longest average text) and mark it.
|
||||
# Only applies to vocabulary tables (≥3 columns: EN | article | DE).
|
||||
# Single/two-column layouts are continuous text, not vocab tables.
|
||||
all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
|
||||
# Find which col_type has the longest average text → English headwords
|
||||
col_avg_len: Dict[str, List[int]] = {}
|
||||
for cell in all_cells:
|
||||
ct = cell.get("col_type", "")
|
||||
txt = cell.get("text", "")
|
||||
col_avg_len.setdefault(ct, []).append(len(txt))
|
||||
en_col_type = None
|
||||
best_avg = 0
|
||||
for ct, lengths in col_avg_len.items():
|
||||
if not ct.startswith("column_"):
|
||||
continue
|
||||
avg = sum(lengths) / len(lengths) if lengths else 0
|
||||
if avg > best_avg:
|
||||
best_avg = avg
|
||||
en_col_type = ct
|
||||
if en_col_type:
|
||||
total_cols = sum(len(z.get("columns", [])) for z in zones_data)
|
||||
if total_cols >= 3:
|
||||
# Find which col_type has the longest average text → English headwords
|
||||
col_avg_len: Dict[str, List[int]] = {}
|
||||
for cell in all_cells:
|
||||
if cell.get("col_type") == en_col_type:
|
||||
cell["_orig_col_type"] = en_col_type
|
||||
cell["col_type"] = "column_en"
|
||||
fix_cell_phonetics(all_cells, pronunciation="british")
|
||||
for cell in all_cells:
|
||||
orig = cell.pop("_orig_col_type", None)
|
||||
if orig:
|
||||
cell["col_type"] = orig
|
||||
ct = cell.get("col_type", "")
|
||||
txt = cell.get("text", "")
|
||||
col_avg_len.setdefault(ct, []).append(len(txt))
|
||||
en_col_type = None
|
||||
best_avg = 0
|
||||
for ct, lengths in col_avg_len.items():
|
||||
if not ct.startswith("column_"):
|
||||
continue
|
||||
avg = sum(lengths) / len(lengths) if lengths else 0
|
||||
if avg > best_avg:
|
||||
best_avg = avg
|
||||
en_col_type = ct
|
||||
if en_col_type:
|
||||
for cell in all_cells:
|
||||
if cell.get("col_type") == en_col_type:
|
||||
cell["_orig_col_type"] = en_col_type
|
||||
cell["col_type"] = "column_en"
|
||||
fix_cell_phonetics(all_cells, pronunciation="british")
|
||||
for cell in all_cells:
|
||||
orig = cell.pop("_orig_col_type", None)
|
||||
if orig:
|
||||
cell["col_type"] = orig
|
||||
|
||||
duration = time.time() - t0
|
||||
|
||||
|
||||
Reference in New Issue
Block a user