Fix IPA correction persistence and false-positive prefix matching
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 21s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 21s
Step 5i was overwriting the IPA-corrected text from Step 5c when reconstructing cells from word_boxes. Added an _ipa_corrected flag to preserve the corrections. Also tightened merged-token prefix matching (min prefix 4 chars, min suffix 3 chars) to prevent false positives such as "sis" being extracted from "si:said".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -890,11 +890,18 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
if cell.get("col_type") == en_col_type:
|
||||
cell["_orig_col_type"] = en_col_type
|
||||
cell["col_type"] = "column_en"
|
||||
# Snapshot text before IPA fix to detect which cells were modified
|
||||
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
|
||||
fix_cell_phonetics(all_cells, pronunciation="british")
|
||||
for cell in all_cells:
|
||||
orig = cell.pop("_orig_col_type", None)
|
||||
if orig:
|
||||
cell["col_type"] = orig
|
||||
# Mark cells whose text was changed by IPA correction so that
|
||||
# later steps (5i) don't overwrite the corrected text when
|
||||
# reconstructing from word_boxes.
|
||||
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
|
||||
cell["_ipa_corrected"] = True
|
||||
|
||||
# 5d. Fix IPA continuation cells — cells where the printed
|
||||
# phonetic transcription wraps to a line below the headword.
|
||||
@@ -1296,7 +1303,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
bullet_removed += len(to_remove)
|
||||
filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
|
||||
cell["word_boxes"] = filtered
|
||||
cell["text"] = _words_to_reading_order_text(filtered)
|
||||
# Don't overwrite text that was corrected by Step 5c IPA fix
|
||||
if not cell.get("_ipa_corrected"):
|
||||
cell["text"] = _words_to_reading_order_text(filtered)
|
||||
|
||||
# Remove cells that became empty after bullet removal
|
||||
if bullet_removed:
|
||||
@@ -1477,6 +1486,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
except Exception as e:
|
||||
logger.warning("Syllable insertion failed: %s", e)
|
||||
|
||||
# Clean up internal flags before returning
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
cell.pop("_ipa_corrected", None)
|
||||
|
||||
result = {
|
||||
"session_id": session_id,
|
||||
"image_width": img_w,
|
||||
|
||||
Reference in New Issue
Block a user