Fix false headers on sparse layouts and IPA corruption on German text
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 17s
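1. Header detection: Add a 25% cap to the single-cell heading heuristic.
On German synonym dictionaries, where most rows naturally have only one
content cell, the old logic marked 60%+ of rows as headers. If more than
25% of a zone's eligible rows would become headings, the heuristic is now
skipped for that zone.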
2. IPA de/all mode: Use "column_text" (light processing, bracket
replacement only) for non-English columns instead of "column_en" (full
processing). The full path runs _insert_missing_ipa(), which splits on
whitespace, matches English prefixes ("bildschön" → "bild"), and
truncates the rest — destroying German comma-separated synonym lists.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -914,9 +914,18 @@ async def _build_grid_core(
|
|||||||
|
|
||||||
if ipa_target_cols:
|
if ipa_target_cols:
|
||||||
for cell in all_cells:
|
for cell in all_cells:
|
||||||
if cell.get("col_type") in ipa_target_cols:
|
ct = cell.get("col_type")
|
||||||
cell["_orig_col_type"] = cell["col_type"]
|
if ct in ipa_target_cols:
|
||||||
cell["col_type"] = "column_en"
|
cell["_orig_col_type"] = ct
|
||||||
|
# Full IPA processing (incl. insertion) only for the
|
||||||
|
# detected English column; other columns get light
|
||||||
|
# processing (bracket replacement only) — our IPA
|
||||||
|
# dictionary is English-only, so inserting IPA into
|
||||||
|
# German text would corrupt it.
|
||||||
|
if ct == en_col_type:
|
||||||
|
cell["col_type"] = "column_en"
|
||||||
|
else:
|
||||||
|
cell["col_type"] = "column_text"
|
||||||
# Snapshot text before IPA fix to detect which cells were modified
|
# Snapshot text before IPA fix to detect which cells were modified
|
||||||
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
|
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
|
||||||
fix_cell_phonetics(all_cells, pronunciation="british")
|
fix_cell_phonetics(all_cells, pronunciation="british")
|
||||||
|
|||||||
@@ -913,6 +913,18 @@ def _detect_heading_rows_by_single_cell(
|
|||||||
continue
|
continue
|
||||||
heading_row_indices.append(ri)
|
heading_row_indices.append(ri)
|
||||||
|
|
||||||
|
# Guard: if >25% of eligible rows would become headings, the
|
||||||
|
# heuristic is misfiring (e.g. sparse single-column layout where
|
||||||
|
# most rows naturally have only 1 content cell).
|
||||||
|
eligible_rows = len(non_header_rows) - 2 # minus first/last excluded
|
||||||
|
if eligible_rows > 0 and len(heading_row_indices) > eligible_rows * 0.25:
|
||||||
|
logger.debug(
|
||||||
|
"Skipping single-cell heading detection for zone %s: "
|
||||||
|
"%d/%d rows would be headings (>25%%)",
|
||||||
|
z.get("zone_index"), len(heading_row_indices), eligible_rows,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
for hri in heading_row_indices:
|
for hri in heading_row_indices:
|
||||||
header_cells = [c for c in cells if c.get("row_index") == hri]
|
header_cells = [c for c in cells if c.get("row_index") == hri]
|
||||||
if not header_cells:
|
if not header_cells:
|
||||||
|
|||||||
Reference in New Issue
Block a user