Fix IPA correction persistence and false-positive prefix matching
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 21s

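Step 5i was overwriting the IPA-corrected text from Step 5c when it
rebuilt cell text from word_boxes. Step 5c now marks every cell whose
text it changed with an internal _ipa_corrected flag; Step 5i leaves the
text of flagged cells untouched, and the flag is removed again before
the result is returned. Also tightened merged-token prefix matching: the
dictionary prefix must be at least 4 chars long and at least 3 chars
must remain after it, to prevent false positives like "sis" being
extracted from "si:said".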

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-25 07:26:32 +01:00
parent 9ea217bdfc
commit c42924a94a
2 changed files with 20 additions and 4 deletions

View File

@@ -1194,9 +1194,11 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
break break
# Merged token: dictionary word + garbled IPA stuck together. # Merged token: dictionary word + garbled IPA stuck together.
# E.g. "fictionsalans'fIkfn" starts with "fiction". # E.g. "fictionsalans'fIkfn" starts with "fiction".
# Extract the dictionary prefix and add it with IPA. # Extract the dictionary prefix (≥4 chars) and add it with
if clean_j and len(clean_j) >= 5: # IPA, but only if enough chars remain after the prefix (≥3)
for pend in range(min(len(clean_j), 15), 2, -1): # to look like garbled IPA, not just a plural 's'.
if clean_j and len(clean_j) >= 7:
for pend in range(min(len(clean_j) - 3, 15), 3, -1):
prefix_j = clean_j[:pend] prefix_j = clean_j[:pend]
prefix_ipa = _lookup_ipa(prefix_j, pronunciation) prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
if prefix_ipa: if prefix_ipa:

View File

@@ -890,11 +890,18 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
if cell.get("col_type") == en_col_type: if cell.get("col_type") == en_col_type:
cell["_orig_col_type"] = en_col_type cell["_orig_col_type"] = en_col_type
cell["col_type"] = "column_en" cell["col_type"] = "column_en"
# Snapshot text before IPA fix to detect which cells were modified
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
fix_cell_phonetics(all_cells, pronunciation="british") fix_cell_phonetics(all_cells, pronunciation="british")
for cell in all_cells: for cell in all_cells:
orig = cell.pop("_orig_col_type", None) orig = cell.pop("_orig_col_type", None)
if orig: if orig:
cell["col_type"] = orig cell["col_type"] = orig
# Mark cells whose text was changed by IPA correction so that
# later steps (5i) don't overwrite the corrected text when
# reconstructing from word_boxes.
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
cell["_ipa_corrected"] = True
# 5d. Fix IPA continuation cells — cells where the printed # 5d. Fix IPA continuation cells — cells where the printed
# phonetic transcription wraps to a line below the headword. # phonetic transcription wraps to a line below the headword.
@@ -1296,6 +1303,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
bullet_removed += len(to_remove) bullet_removed += len(to_remove)
filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove] filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
cell["word_boxes"] = filtered cell["word_boxes"] = filtered
# Don't overwrite text that was corrected by Step 5c IPA fix
if not cell.get("_ipa_corrected"):
cell["text"] = _words_to_reading_order_text(filtered) cell["text"] = _words_to_reading_order_text(filtered)
# Remove cells that became empty after bullet removal # Remove cells that became empty after bullet removal
@@ -1477,6 +1486,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
except Exception as e: except Exception as e:
logger.warning("Syllable insertion failed: %s", e) logger.warning("Syllable insertion failed: %s", e)
# Clean up internal flags before returning
for z in zones_data:
for cell in z.get("cells", []):
cell.pop("_ipa_corrected", None)
result = { result = {
"session_id": session_id, "session_id": session_id,
"image_width": img_w, "image_width": img_w,