Compare commits
2 Commits
4feec7c7b7
...
c42924a94a
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c42924a94a | ||
|
|
9ea217bdfc |
@@ -1030,6 +1030,15 @@ def _text_has_garbled_ipa(text: str) -> bool:
|
|||||||
# Contains IPA special characters
|
# Contains IPA special characters
|
||||||
if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
|
if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
|
||||||
return True
|
return True
|
||||||
|
# Embedded apostrophe suggesting merged garbled IPA with stress mark.
|
||||||
|
# E.g. "Scotland'skotland" — OCR reads ˈ as '.
|
||||||
|
# Guard: apostrophe must be after ≥3 chars and before ≥3 lowercase
|
||||||
|
# chars to avoid contractions (don't, won't, o'clock).
|
||||||
|
if "'" in w and not w.startswith("'"):
|
||||||
|
apos_idx = w.index("'")
|
||||||
|
after = w[apos_idx + 1:]
|
||||||
|
if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
|
||||||
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
@@ -1183,6 +1192,19 @@ def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
|
|||||||
if _lookup_ipa(clean_j, pronunciation):
|
if _lookup_ipa(clean_j, pronunciation):
|
||||||
kept.extend(words[j:])
|
kept.extend(words[j:])
|
||||||
break
|
break
|
||||||
|
# Merged token: dictionary word + garbled IPA stuck together.
|
||||||
|
# E.g. "fictionsalans'fIkfn" starts with "fiction".
|
||||||
|
# Extract the dictionary prefix (≥4 chars) and add it with
|
||||||
|
# IPA, but only if enough chars remain after the prefix (≥3)
|
||||||
|
# to look like garbled IPA, not just a plural 's'.
|
||||||
|
if clean_j and len(clean_j) >= 7:
|
||||||
|
for pend in range(min(len(clean_j) - 3, 15), 3, -1):
|
||||||
|
prefix_j = clean_j[:pend]
|
||||||
|
prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
|
||||||
|
if prefix_ipa:
|
||||||
|
kept.append(f"{prefix_j} [{prefix_ipa}]")
|
||||||
|
break
|
||||||
|
break # rest of this token is garbled
|
||||||
# Otherwise — likely garbled phonetics, skip
|
# Otherwise — likely garbled phonetics, skip
|
||||||
words = kept
|
words = kept
|
||||||
break
|
break
|
||||||
|
|||||||
@@ -858,6 +858,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
# Single/two-column layouts are continuous text, not vocab tables.
|
# Single/two-column layouts are continuous text, not vocab tables.
|
||||||
all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
|
all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
|
||||||
total_cols = sum(len(z.get("columns", [])) for z in zones_data)
|
total_cols = sum(len(z.get("columns", [])) for z in zones_data)
|
||||||
|
en_col_type = None
|
||||||
if total_cols >= 3:
|
if total_cols >= 3:
|
||||||
# Find the column that contains IPA brackets → English headwords.
|
# Find the column that contains IPA brackets → English headwords.
|
||||||
# Count cells with bracket patterns per col_type. The column with
|
# Count cells with bracket patterns per col_type. The column with
|
||||||
@@ -872,7 +873,6 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
if ct.startswith("column_") and '[' in txt:
|
if ct.startswith("column_") and '[' in txt:
|
||||||
col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1
|
col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1
|
||||||
# Pick column with most bracket IPA patterns
|
# Pick column with most bracket IPA patterns
|
||||||
en_col_type = None
|
|
||||||
if col_bracket_count:
|
if col_bracket_count:
|
||||||
en_col_type = max(col_bracket_count, key=col_bracket_count.get)
|
en_col_type = max(col_bracket_count, key=col_bracket_count.get)
|
||||||
else:
|
else:
|
||||||
@@ -890,11 +890,18 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
if cell.get("col_type") == en_col_type:
|
if cell.get("col_type") == en_col_type:
|
||||||
cell["_orig_col_type"] = en_col_type
|
cell["_orig_col_type"] = en_col_type
|
||||||
cell["col_type"] = "column_en"
|
cell["col_type"] = "column_en"
|
||||||
|
# Snapshot text before IPA fix to detect which cells were modified
|
||||||
|
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
|
||||||
fix_cell_phonetics(all_cells, pronunciation="british")
|
fix_cell_phonetics(all_cells, pronunciation="british")
|
||||||
for cell in all_cells:
|
for cell in all_cells:
|
||||||
orig = cell.pop("_orig_col_type", None)
|
orig = cell.pop("_orig_col_type", None)
|
||||||
if orig:
|
if orig:
|
||||||
cell["col_type"] = orig
|
cell["col_type"] = orig
|
||||||
|
# Mark cells whose text was changed by IPA correction so that
|
||||||
|
# later steps (5i) don't overwrite the corrected text when
|
||||||
|
# reconstructing from word_boxes.
|
||||||
|
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
|
||||||
|
cell["_ipa_corrected"] = True
|
||||||
|
|
||||||
# 5d. Fix IPA continuation cells — cells where the printed
|
# 5d. Fix IPA continuation cells — cells where the printed
|
||||||
# phonetic transcription wraps to a line below the headword.
|
# phonetic transcription wraps to a line below the headword.
|
||||||
@@ -1105,6 +1112,10 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
slash_ipa_fixed = 0
|
slash_ipa_fixed = 0
|
||||||
for z in zones_data:
|
for z in zones_data:
|
||||||
for cell in z.get("cells", []):
|
for cell in z.get("cells", []):
|
||||||
|
# Only process English headword column — avoid converting
|
||||||
|
# German text like "der/die/das" to IPA.
|
||||||
|
if en_col_type and cell.get("col_type") != en_col_type:
|
||||||
|
continue
|
||||||
text = cell.get("text", "")
|
text = cell.get("text", "")
|
||||||
if "/" not in text:
|
if "/" not in text:
|
||||||
continue
|
continue
|
||||||
@@ -1292,7 +1303,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
bullet_removed += len(to_remove)
|
bullet_removed += len(to_remove)
|
||||||
filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
|
filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
|
||||||
cell["word_boxes"] = filtered
|
cell["word_boxes"] = filtered
|
||||||
cell["text"] = _words_to_reading_order_text(filtered)
|
# Don't overwrite text that was corrected by Step 5c IPA fix
|
||||||
|
if not cell.get("_ipa_corrected"):
|
||||||
|
cell["text"] = _words_to_reading_order_text(filtered)
|
||||||
|
|
||||||
# Remove cells that became empty after bullet removal
|
# Remove cells that became empty after bullet removal
|
||||||
if bullet_removed:
|
if bullet_removed:
|
||||||
@@ -1473,6 +1486,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Syllable insertion failed: %s", e)
|
logger.warning("Syllable insertion failed: %s", e)
|
||||||
|
|
||||||
|
# Clean up internal flags before returning
|
||||||
|
for z in zones_data:
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
cell.pop("_ipa_corrected", None)
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
"session_id": session_id,
|
"session_id": session_id,
|
||||||
"image_width": img_w,
|
"image_width": img_w,
|
||||||
|
|||||||
Reference in New Issue
Block a user