Preserve IPA continuation rows in grid output
Stop removing rows that contain only the phonetic transcription wrapped onto the line below the headword. These rows are valid content that users need to see; anyone who does not need them can delete them manually.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1324,64 +1324,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
if orig:
|
if orig:
|
||||||
cell["col_type"] = orig
|
cell["col_type"] = orig
|
||||||
|
|
||||||
# 5d. Remove IPA continuation rows — rows where the printed
|
# 5d. IPA continuation rows are preserved — they contain the
|
||||||
# phonetic transcription wraps to a line below the headword.
|
# printed phonetic transcription that wraps to a line below the
|
||||||
# These rows have text only in the English column (+ margin
|
# headword. The user can manually delete them if not needed.
|
||||||
# noise) and fix_cell_phonetics did NOT insert IPA brackets
|
|
||||||
# (because there's no real English word to look up).
|
|
||||||
ipa_cont_rows: set = set()
|
|
||||||
for z in zones_data:
|
|
||||||
for row in z.get("rows", []):
|
|
||||||
ri = row["index"]
|
|
||||||
row_cells = [
|
|
||||||
c for c in z.get("cells", [])
|
|
||||||
if c.get("row_index") == ri
|
|
||||||
]
|
|
||||||
en_cells = [
|
|
||||||
c for c in row_cells
|
|
||||||
if c.get("col_type") == en_col_type
|
|
||||||
]
|
|
||||||
# Other cells with ≥3 chars (ignore margin noise)
|
|
||||||
other_cells = [
|
|
||||||
c for c in row_cells
|
|
||||||
if c.get("col_type") != en_col_type
|
|
||||||
and len((c.get("text") or "").strip()) >= 3
|
|
||||||
]
|
|
||||||
if en_cells and not other_cells:
|
|
||||||
en_text = en_cells[0].get("text", "")
|
|
||||||
# Strip any IPA brackets that fix_cell_phonetics
|
|
||||||
# may have added for short dictionary matches
|
|
||||||
# (e.g. "si" → "[si]") to check underlying text.
|
|
||||||
text_bare = re.sub(r'\[[^\]]*\]', '', en_text).strip()
|
|
||||||
# Garbled IPA typically contains ':' (length mark)
|
|
||||||
# or starts with ' (stress mark), and has no word
|
|
||||||
# with ≥3 letters that could be a real headword.
|
|
||||||
has_headword = any(
|
|
||||||
len(re.sub(r'[^a-zA-Z]', '', w)) >= 3
|
|
||||||
for w in text_bare.split()
|
|
||||||
) if text_bare else False
|
|
||||||
looks_phonetic = (
|
|
||||||
':' in text_bare
|
|
||||||
or text_bare.startswith("'")
|
|
||||||
or text_bare.startswith("\u2019")
|
|
||||||
or not has_headword
|
|
||||||
)
|
|
||||||
if looks_phonetic:
|
|
||||||
ipa_cont_rows.add(ri)
|
|
||||||
if ipa_cont_rows:
|
|
||||||
for z in zones_data:
|
|
||||||
z["rows"] = [
|
|
||||||
r for r in z.get("rows", [])
|
|
||||||
if r["index"] not in ipa_cont_rows
|
|
||||||
]
|
|
||||||
z["cells"] = [
|
|
||||||
c for c in z.get("cells", [])
|
|
||||||
if c.get("row_index") not in ipa_cont_rows
|
|
||||||
]
|
|
||||||
logger.info(
|
|
||||||
"removed %d IPA continuation rows: %s",
|
|
||||||
len(ipa_cont_rows), sorted(ipa_cont_rows),
|
|
||||||
)
|
|
||||||
|
|
||||||
duration = time.time() - t0
|
duration = time.time() - t0
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user