Compare commits
4 Commits
9681fcbd05
...
a579c31ddb
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a579c31ddb | ||
|
|
0f9c0d2ad0 | ||
|
|
278067fe20 | ||
|
|
d76fb2a9c8 |
@@ -1250,6 +1250,32 @@ def fix_ipa_continuation_cell(
|
|||||||
if not IPA_AVAILABLE or not garbled_text or not headword_text:
|
if not IPA_AVAILABLE or not garbled_text or not headword_text:
|
||||||
return garbled_text
|
return garbled_text
|
||||||
|
|
||||||
|
# If headword already has inline IPA like "beat [bˈiːt] , beat, beaten",
|
||||||
|
# only generate continuation IPA for words NOT already covered.
|
||||||
|
covered_words: set = set()
|
||||||
|
has_inline_ipa = bool(re.search(r'\[[^\]]*\]', headword_text))
|
||||||
|
if has_inline_ipa:
|
||||||
|
# Words before the first bracket already have their IPA shown
|
||||||
|
first_bracket = headword_text.index('[')
|
||||||
|
pre_bracket = headword_text[:first_bracket].strip()
|
||||||
|
for w in pre_bracket.split():
|
||||||
|
clean = re.sub(r'[^a-zA-Z\'-]', '', w).lower()
|
||||||
|
if clean and len(clean) >= 2:
|
||||||
|
covered_words.add(clean)
|
||||||
|
|
||||||
|
last_bracket_end = headword_text.rfind(']')
|
||||||
|
tail = headword_text[last_bracket_end + 1:].strip()
|
||||||
|
|
||||||
|
if not tail or not re.search(r'[a-zA-Z]{2,}', tail):
|
||||||
|
# Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]")
|
||||||
|
# — return the inline IPA directly (continuation duplicates it)
|
||||||
|
last_bracket_start = headword_text.rfind('[')
|
||||||
|
inline_ipa = headword_text[last_bracket_start:last_bracket_end + 1]
|
||||||
|
return inline_ipa
|
||||||
|
|
||||||
|
# Only the tail words need continuation IPA
|
||||||
|
headword_text = tail
|
||||||
|
|
||||||
# Strip existing IPA brackets and parenthetical grammar annotations
|
# Strip existing IPA brackets and parenthetical grammar annotations
|
||||||
# like "(no pl)", "(sth)", "(sb)" from headword text
|
# like "(no pl)", "(sth)", "(sb)" from headword text
|
||||||
clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
|
clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
|
||||||
@@ -1270,6 +1296,7 @@ def fix_ipa_continuation_cell(
|
|||||||
# Do NOT skip grammar words here — they are integral parts of the
|
# Do NOT skip grammar words here — they are integral parts of the
|
||||||
# headword (e.g. "close down", "the United Kingdom"). Grammar
|
# headword (e.g. "close down", "the United Kingdom"). Grammar
|
||||||
# annotations like "(sth)", "(no pl)" are already stripped above.
|
# annotations like "(sth)", "(no pl)" are already stripped above.
|
||||||
|
# Skip words that already have inline IPA in the headword row.
|
||||||
ipa_parts: List[str] = []
|
ipa_parts: List[str] = []
|
||||||
for part in parts:
|
for part in parts:
|
||||||
# A part may be multi-word like "secondary school"
|
# A part may be multi-word like "secondary school"
|
||||||
@@ -1279,6 +1306,8 @@ def fix_ipa_continuation_cell(
|
|||||||
clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
|
clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
|
||||||
if not clean_w or len(clean_w) < 2:
|
if not clean_w or len(clean_w) < 2:
|
||||||
continue
|
continue
|
||||||
|
if covered_words and clean_w.lower() in covered_words:
|
||||||
|
continue # Already has IPA inline in the headword
|
||||||
ipa = _lookup_ipa(clean_w, pronunciation)
|
ipa = _lookup_ipa(clean_w, pronunciation)
|
||||||
if ipa:
|
if ipa:
|
||||||
word_ipas.append(ipa)
|
word_ipas.append(ipa)
|
||||||
|
|||||||
@@ -1797,6 +1797,12 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
if not ct.startswith("column_"):
|
if not ct.startswith("column_"):
|
||||||
continue
|
continue
|
||||||
cell_text = (cell.get("text") or "").strip()
|
cell_text = (cell.get("text") or "").strip()
|
||||||
|
if not cell_text:
|
||||||
|
# Step 5c may have emptied garbled IPA cells like
|
||||||
|
# "[n, nn]" — recover text from word_boxes.
|
||||||
|
wb_texts = [w.get("text", "")
|
||||||
|
for w in cell.get("word_boxes", [])]
|
||||||
|
cell_text = " ".join(wb_texts).strip()
|
||||||
if not cell_text:
|
if not cell_text:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -1877,11 +1883,15 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
if stripped and stripped != text:
|
if stripped and stripped != text:
|
||||||
cell["text"] = stripped
|
cell["text"] = stripped
|
||||||
|
|
||||||
# 5g. Extract page_ref rows and footer rows from content zones.
|
# 5g. Extract page_ref cells and footer rows from content zones.
|
||||||
# Page references (column_1 cells like "p.70") and footer lines
|
# Page references (column_1 cells like "p.70") sit in rows that
|
||||||
# (e.g. "two hundred and twelve" = page number) should not be part
|
# also contain vocabulary — extract them as zone metadata without
|
||||||
# of the vocabulary table. Move them to zone-level metadata so the
|
# removing the row. Footer lines (e.g. "two hundred and twelve"
|
||||||
# frontend can display them separately.
|
# = page number at bottom) are standalone rows; they stay in the
|
||||||
|
# table but are tagged as footers so the frontend can treat them apart.
|
||||||
|
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
||||||
|
# Page-ref pattern: "p.70", "P.70", ",.65" (garbled "p"), or bare "70"
|
||||||
|
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
|
||||||
for z in zones_data:
|
for z in zones_data:
|
||||||
if z.get("zone_type") != "content":
|
if z.get("zone_type") != "content":
|
||||||
continue
|
continue
|
||||||
@@ -1890,53 +1900,61 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
if not rows:
|
if not rows:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Extract column_1 cells that look like page references
|
||||||
page_refs = []
|
page_refs = []
|
||||||
footer_rows = []
|
page_ref_cell_ids = set()
|
||||||
|
for cell in cells:
|
||||||
# Detect page_ref rows: rows where the ONLY cell is column_1
|
if cell.get("col_type") != "column_1":
|
||||||
# (just a page number like "p.65", "p.70")
|
continue
|
||||||
for row in rows:
|
text = (cell.get("text") or "").strip()
|
||||||
if row.get("is_header"):
|
if not text:
|
||||||
|
continue
|
||||||
|
if not _PAGE_REF_RE.match(text):
|
||||||
continue
|
continue
|
||||||
ri = row["index"]
|
|
||||||
row_cells = [c for c in cells if c.get("row_index") == ri]
|
|
||||||
if (len(row_cells) == 1
|
|
||||||
and row_cells[0].get("col_type") == "column_1"):
|
|
||||||
page_refs.append({
|
page_refs.append({
|
||||||
"row_index": ri,
|
"row_index": cell.get("row_index"),
|
||||||
"text": (row_cells[0].get("text") or "").strip(),
|
"text": text,
|
||||||
"bbox_pct": row_cells[0].get("bbox_pct", {}),
|
"bbox_pct": cell.get("bbox_pct", {}),
|
||||||
})
|
})
|
||||||
|
page_ref_cell_ids.add(cell.get("cell_id"))
|
||||||
|
|
||||||
# Detect footer: last non-header row if it has only 1 content
|
# Remove page_ref cells from the table (but keep their rows)
|
||||||
# cell and no column_1 page_ref (standalone text like page num)
|
if page_ref_cell_ids:
|
||||||
|
z["cells"] = [c for c in z["cells"]
|
||||||
|
if c.get("cell_id") not in page_ref_cell_ids]
|
||||||
|
|
||||||
|
# Detect footer: last non-header row if it has only 1 cell
|
||||||
|
# and the text is NOT IPA (no real IPA Unicode symbols).
|
||||||
|
# This catches page numbers like "two hundred and twelve".
|
||||||
|
footer_rows = []
|
||||||
non_header_rows = [r for r in rows if not r.get("is_header")]
|
non_header_rows = [r for r in rows if not r.get("is_header")]
|
||||||
if non_header_rows:
|
if non_header_rows:
|
||||||
last_row = non_header_rows[-1]
|
last_row = non_header_rows[-1]
|
||||||
last_ri = last_row["index"]
|
last_ri = last_row["index"]
|
||||||
last_cells = [c for c in cells if c.get("row_index") == last_ri]
|
last_cells = [c for c in z["cells"]
|
||||||
content_last = [
|
if c.get("row_index") == last_ri]
|
||||||
c for c in last_cells
|
if len(last_cells) == 1:
|
||||||
if c.get("col_type", "").startswith("column_")
|
text = (last_cells[0].get("text") or "").strip()
|
||||||
and c.get("col_type") != "column_1"
|
# Not IPA (no real IPA symbols) and not a heading
|
||||||
]
|
has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
|
||||||
if len(content_last) == 1 and len(last_cells) == 1:
|
if text and not has_real_ipa and last_cells[0].get("col_type") != "heading":
|
||||||
footer_rows.append({
|
footer_rows.append({
|
||||||
"row_index": last_ri,
|
"row_index": last_ri,
|
||||||
"text": (content_last[0].get("text") or "").strip(),
|
"text": text,
|
||||||
"bbox_pct": content_last[0].get("bbox_pct", {}),
|
"bbox_pct": last_cells[0].get("bbox_pct", {}),
|
||||||
})
|
})
|
||||||
|
|
||||||
# Remove page_ref and footer cells/rows from the table
|
# Mark footer rows (keep in table, just tag for frontend)
|
||||||
remove_ris = set()
|
if footer_rows:
|
||||||
for pr in page_refs:
|
footer_ris = {fr["row_index"] for fr in footer_rows}
|
||||||
remove_ris.add(pr["row_index"])
|
for r in z["rows"]:
|
||||||
for fr in footer_rows:
|
if r["index"] in footer_ris:
|
||||||
remove_ris.add(fr["row_index"])
|
r["is_footer"] = True
|
||||||
|
for c in z["cells"]:
|
||||||
|
if c.get("row_index") in footer_ris:
|
||||||
|
c["col_type"] = "footer"
|
||||||
|
|
||||||
if remove_ris:
|
if page_refs or footer_rows:
|
||||||
z["cells"] = [c for c in cells if c.get("row_index") not in remove_ris]
|
|
||||||
z["rows"] = [r for r in rows if r["index"] not in remove_ris]
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"Extracted %d page_refs + %d footer rows from zone %d",
|
"Extracted %d page_refs + %d footer rows from zone %d",
|
||||||
len(page_refs), len(footer_rows), z.get("zone_index", 0),
|
len(page_refs), len(footer_rows), z.get("zone_index", 0),
|
||||||
|
|||||||
@@ -510,6 +510,23 @@ class TestGarbledIpaDetection:
|
|||||||
assert "klˈəʊs" in fixed # close IPA
|
assert "klˈəʊs" in fixed # close IPA
|
||||||
assert "dˈaʊn" in fixed # down IPA — must NOT be skipped
|
assert "dˈaʊn" in fixed # down IPA — must NOT be skipped
|
||||||
|
|
||||||
|
def test_continuation_skips_words_with_inline_ipa(self):
    """'beat [bˈiːt] , beat, beaten' → continuation only for 'beaten'.

    The headword row already shows the IPA for "beat" inline, so the
    repaired continuation cell must contain the transcription of
    "beaten" and nothing else.
    """
    fixed = fix_ipa_continuation_cell(
        "[bi:tan]", "beat [bˈiːt] , beat, beaten", pronunciation="british",
    )
    # The remaining word "beaten" must be transcribed.
    assert "bˈiːtən" in fixed
    # "beat" must NOT be transcribed again. A substring count of "bˈiːt"
    # cannot express this, because "bˈiːtən" itself starts with "bˈiːt";
    # compare against the exact expected cell instead.
    assert fixed == "[bˈiːtən]"
|
||||||
|
|
||||||
|
def test_continuation_bracket_at_end_returns_inline(self):
    """'the Highlands [ˈhaɪləndz]' → the inline bracket is returned as-is.

    When the IPA bracket closes the headword there are no trailing words
    left to transcribe, so the article "the" must not receive its own IPA.
    """
    garbled_cell = "'hailandz"
    headword = "the Highlands [ˈhaɪləndz]"
    result = fix_ipa_continuation_cell(
        garbled_cell, headword, pronunciation="british",
    )
    # Exactly the inline transcription taken from the headword row.
    assert result == "[ˈhaɪləndz]"
    # No transcription of the article may be present.
    assert "ðə" not in result
|
||||||
|
|
||||||
def test_headword_with_brackets_not_continuation(self):
|
def test_headword_with_brackets_not_continuation(self):
|
||||||
"""'employee [im'ploi:]' has a headword outside brackets → not garbled.
|
"""'employee [im'ploi:]' has a headword outside brackets → not garbled.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user