Preserve IPA continuation rows in grid output

Stop removing rows that contain only phonetic transcription below the headword. These rows are valid content that users need to see.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 10:22:58 +01:00
parent 038eaf783c
commit 050d410ba0
1 changed file with 3 additions and 58 deletions
@@ -1324,64 +1324,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
            if orig:
                cell["col_type"] = orig
-        # 5d. Remove IPA continuation rows — rows where the printed
+        # 5d. IPA continuation rows are preserved — they contain the
-        # phonetic transcription wraps to a line below the headword.
+        # printed phonetic transcription that wraps to a line below the
-        # These rows have text only in the English column (+ margin
+        # headword.  The user can manually delete them if not needed.
        # noise) and fix_cell_phonetics did NOT insert IPA brackets
        # (because there's no real English word to look up).
        ipa_cont_rows: set = set()
        for z in zones_data:
            for row in z.get("rows", []):
                ri = row["index"]
                row_cells = [
                    c for c in z.get("cells", [])
                    if c.get("row_index") == ri
                ]
                en_cells = [
                    c for c in row_cells
                    if c.get("col_type") == en_col_type
                ]
                # Other cells with ≥3 chars (ignore margin noise)
                other_cells = [
                    c for c in row_cells
                    if c.get("col_type") != en_col_type
                    and len((c.get("text") or "").strip()) >= 3
                ]
                if en_cells and not other_cells:
                    en_text = en_cells[0].get("text", "")
                    # Strip any IPA brackets that fix_cell_phonetics
                    # may have added for short dictionary matches
                    # (e.g. "si" → "[si]") to check underlying text.
                    text_bare = re.sub(r'\[[^\]]*\]', '', en_text).strip()
                    # Garbled IPA typically contains ':' (length mark)
                    # or starts with ' (stress mark), and has no word
                    # with ≥3 letters that could be a real headword.
                    has_headword = any(
                        len(re.sub(r'[^a-zA-Z]', '', w)) >= 3
                        for w in text_bare.split()
                    ) if text_bare else False
                    looks_phonetic = (
                        ':' in text_bare
                        or text_bare.startswith("'")
                        or text_bare.startswith("\u2019")
                        or not has_headword
                    )
                    if looks_phonetic:
                        ipa_cont_rows.add(ri)
        if ipa_cont_rows:
            for z in zones_data:
                z["rows"] = [
                    r for r in z.get("rows", [])
                    if r["index"] not in ipa_cont_rows
                ]
                z["cells"] = [
                    c for c in z.get("cells", [])
                    if c.get("row_index") not in ipa_cont_rows
                ]
            logger.info(
                "removed %d IPA continuation rows: %s",
                len(ipa_cont_rows), sorted(ipa_cont_rows),
            )
    duration = time.time() - t0