Add IPA and syllable mode toggles, fix false IPA on German documents
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 15s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 15s
Backend: Remove en_col_type fallback heuristic (longest avg text) that incorrectly identified German columns as English. IPA now only applied when OCR bracket patterns are actually found. Add ipa_mode (auto/all/none) and syllable_mode (auto/all/none) query params to build-grid API. Frontend: Add IPA and Silben dropdown selects to GridToolbar. Modes are passed as query params on rebuild. Auto = current smart detection, All = force for all words, Aus = skip entirely. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -194,6 +194,8 @@ def insert_syllable_dividers(
|
||||
zones_data: List[Dict],
|
||||
img_bgr: np.ndarray,
|
||||
session_id: str,
|
||||
*,
|
||||
force: bool = False,
|
||||
) -> int:
|
||||
"""Insert pipe syllable dividers into dictionary cells.
|
||||
|
||||
@@ -204,6 +206,10 @@ def insert_syllable_dividers(
|
||||
OCR. This guards against pages with zero pipe characters (the primary
|
||||
guard — article_col_index — is checked at the call site).
|
||||
|
||||
Args:
|
||||
force: If True, skip the pipe-ratio pre-check and syllabify all
|
||||
content words regardless of whether the original has pipe dividers.
|
||||
|
||||
Returns the number of cells modified.
|
||||
"""
|
||||
hyph_de, hyph_en = _get_hyphenators()
|
||||
@@ -215,24 +221,25 @@ def insert_syllable_dividers(
|
||||
# Real dictionary pages with printed syllable dividers will have OCR-
|
||||
# detected pipes in many cells. Pages without syllable dividers will
|
||||
# have zero — skip those to avoid false syllabification.
|
||||
total_col_cells = 0
|
||||
cells_with_pipes = 0
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
if cell.get("col_type", "").startswith("column_"):
|
||||
total_col_cells += 1
|
||||
if "|" in cell.get("text", ""):
|
||||
cells_with_pipes += 1
|
||||
if not force:
|
||||
total_col_cells = 0
|
||||
cells_with_pipes = 0
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
if cell.get("col_type", "").startswith("column_"):
|
||||
total_col_cells += 1
|
||||
if "|" in cell.get("text", ""):
|
||||
cells_with_pipes += 1
|
||||
|
||||
if total_col_cells > 0:
|
||||
pipe_ratio = cells_with_pipes / total_col_cells
|
||||
if pipe_ratio < 0.01:
|
||||
logger.info(
|
||||
"build-grid session %s: skipping syllable insertion — "
|
||||
"only %.1f%% of cells have existing pipes (need >=1%%)",
|
||||
session_id, pipe_ratio * 100,
|
||||
)
|
||||
return 0
|
||||
if total_col_cells > 0:
|
||||
pipe_ratio = cells_with_pipes / total_col_cells
|
||||
if pipe_ratio < 0.01:
|
||||
logger.info(
|
||||
"build-grid session %s: skipping syllable insertion — "
|
||||
"only %.1f%% of cells have existing pipes (need >=1%%)",
|
||||
session_id, pipe_ratio * 100,
|
||||
)
|
||||
return 0
|
||||
|
||||
insertions = 0
|
||||
for z in zones_data:
|
||||
|
||||
Reference in New Issue
Block a user