Fix false headers on sparse layouts and IPA corruption on German text

1. Header detection: Add a 25% cap to the single-cell heading heuristic. On German synonym dicts, where most rows naturally have only 1 content cell, the old logic marked 60%+ of rows as headers.

2. IPA de/all mode: Use "column_text" (light processing) for non-English columns instead of "column_en" (full processing). The full path runs _insert_missing_ipa(), which splits on whitespace, matches English prefixes ("bildschön" → "bild"), and truncates the rest — destroying German comma-separated synonym lists.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-25 21:49:05 +01:00
parent 256df820cd
commit 76cd1ac020
2 changed files with 24 additions and 3 deletions
@@ -914,9 +914,18 @@ async def _build_grid_core(
        if ipa_target_cols:
            for cell in all_cells:
-                if cell.get("col_type") in ipa_target_cols:
+                ct = cell.get("col_type")
-                    cell["_orig_col_type"] = cell["col_type"]
+                if ct in ipa_target_cols:
-                    cell["col_type"] = "column_en"
+                    cell["_orig_col_type"] = ct
                    # Full IPA processing (incl. insertion) only for the
                    # detected English column; other columns get light
                    # processing (bracket replacement only) — our IPA
                    # dictionary is English-only, so inserting IPA into
                    # German text would corrupt it.
                    if ct == en_col_type:
                        cell["col_type"] = "column_en"
                    else:
                        cell["col_type"] = "column_text"
        # Snapshot text before IPA fix to detect which cells were modified
        _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
        fix_cell_phonetics(all_cells, pronunciation="british")
@@ -913,6 +913,18 @@ def _detect_heading_rows_by_single_cell(
                continue
            heading_row_indices.append(ri)
        # Guard: if >25% of eligible rows would become headings, the
        # heuristic is misfiring (e.g. sparse single-column layout where
        # most rows naturally have only 1 content cell).
        eligible_rows = len(non_header_rows) - 2  # minus first/last excluded
        if eligible_rows > 0 and len(heading_row_indices) > eligible_rows * 0.25:
            logger.debug(
                "Skipping single-cell heading detection for zone %s: "
                "%d/%d rows would be headings (>25%%)",
                z.get("zone_index"), len(heading_row_indices), eligible_rows,
            )
            continue
        for hri in heading_row_indices:
            header_cells = [c for c in cells if c.get("row_index") == hri]
            if not header_cells: