Add language-specific IPA and syllable modes (de/en)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 15s

Extend ipa_mode and syllable_mode toggles with language options:
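- auto: apply only when detected (default) — IPA when an English headword column is found, syllables when the original has pipe dividers
- en: only the detected English headword column
- de: only the German/definition columns (all content columns except the English headword column)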
- all: all content columns
- none: skip entirely

Also improve English column auto-detection: use garbled IPA patterns
(apostrophes, colons) in addition to bracket patterns. This correctly
identifies English dictionary pages where OCR produces garbled ASCII
instead of bracket IPA.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-25 08:16:29 +01:00
parent 34680732f8
commit 83c058e400
4 changed files with 68 additions and 31 deletions

View File

@@ -80,9 +80,11 @@ async def _build_grid_core(
session_id: Session identifier (for logging and image loading).
session: Full session dict from get_session_db().
ipa_mode: "auto" (only when English headwords detected), "all"
(force IPA on all content columns), or "none" (skip IPA entirely).
(force IPA on all content columns), "en" (English column only),
"de" (German/definition columns only), or "none" (skip entirely).
syllable_mode: "auto" (only when original has pipe dividers),
"all" (force syllabification on all words), or "none" (skip).
"all" (force syllabification on all words), "en" (English only),
"de" (German only), or "none" (skip).
Returns:
StructuredGrid result dict.
@@ -869,32 +871,51 @@ async def _build_grid_core(
all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
total_cols = sum(len(z.get("columns", [])) for z in zones_data)
en_col_type = None
ipa_target_cols: set = set()
all_content_cols: set = set()
skip_ipa = (ipa_mode == "none")
if not skip_ipa and total_cols >= 3:
# Find the column that contains IPA brackets → English headwords.
# Count cells with bracket patterns per col_type. The column with
# the most brackets is the headword column (IPA sits after or below
# headwords).
col_bracket_count: Dict[str, int] = {}
# Detect English headword column via IPA signals (brackets or garbled).
col_ipa_count: Dict[str, int] = {}
all_content_cols: set = set()
for cell in all_cells:
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
txt = cell.get("text", "") or ""
if ct.startswith("column_") and '[' in txt:
col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1
# Pick column with most bracket IPA patterns.
# ipa_mode="auto": only when OCR already found bracket IPA (no fallback).
# ipa_mode="all": fallback to headword_col_index from dictionary detection.
if col_bracket_count:
en_col_type = max(col_bracket_count, key=col_bracket_count.get)
elif ipa_mode == "all":
# Force IPA: use headword column from dictionary detection
hw_idx = dict_detection.get("headword_col_index")
if hw_idx is not None:
en_col_type = f"column_{hw_idx + 1}"
if en_col_type:
if txt.strip():
all_content_cols.add(ct)
if '[' in txt or _text_has_garbled_ipa(txt):
col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
if col_ipa_count:
en_col_type = max(col_ipa_count, key=col_ipa_count.get)
elif ipa_mode in ("all", "de", "en"):
# Force mode without auto-detection: pick column with most cells
col_cell_count: Dict[str, int] = {}
for cell in all_cells:
if cell.get("col_type") == en_col_type:
cell["_orig_col_type"] = en_col_type
ct = cell.get("col_type", "")
if ct.startswith("column_") and (cell.get("text") or "").strip():
col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
if col_cell_count:
en_col_type = max(col_cell_count, key=col_cell_count.get)
# Decide which columns to process based on ipa_mode:
# auto/en: only the detected EN headword column
# de: all content columns EXCEPT the EN column
# all: all content columns
ipa_target_cols: set = set()
if ipa_mode in ("auto", "en"):
if en_col_type:
ipa_target_cols.add(en_col_type)
elif ipa_mode == "de":
ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
elif ipa_mode == "all":
ipa_target_cols = all_content_cols
if ipa_target_cols:
for cell in all_cells:
if cell.get("col_type") in ipa_target_cols:
cell["_orig_col_type"] = cell["col_type"]
cell["col_type"] = "column_en"
# Snapshot text before IPA fix to detect which cells were modified
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
@@ -1476,24 +1497,31 @@ async def _build_grid_core(
# --- Syllable divider insertion for dictionary pages ---
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
# "all" = force syllabification on all content words,
# "none" = skip entirely.
# "all" = force on all content words, "en" = English column only,
# "de" = German columns only, "none" = skip entirely.
syllable_insertions = 0
if syllable_mode != "none" and img_bgr is not None:
_syllable_eligible = False
if syllable_mode == "all":
if syllable_mode in ("all", "de", "en"):
_syllable_eligible = True
elif (dict_detection.get("is_dictionary")
and dict_detection.get("article_col_index") is not None):
# auto: only on dictionary pages with article columns
_syllable_eligible = True
# For language-specific modes, determine allowed columns
_syllable_col_filter: Optional[set] = None # None = all columns
if syllable_mode == "en" and en_col_type:
_syllable_col_filter = {en_col_type}
elif syllable_mode == "de" and en_col_type:
_syllable_col_filter = all_content_cols - {en_col_type} if total_cols >= 3 else None
if _syllable_eligible:
try:
from cv_syllable_detect import insert_syllable_dividers
force_syllables = (syllable_mode == "all")
force_syllables = (syllable_mode in ("all", "de", "en"))
syllable_insertions = insert_syllable_dividers(
zones_data, img_bgr, session_id,
force=force_syllables,
col_filter=_syllable_col_filter,
)
except Exception as e:
logger.warning("Syllable insertion failed: %s", e)
@@ -1538,7 +1566,7 @@ async def _build_grid_core(
"processing_modes": {
"ipa_mode": ipa_mode,
"syllable_mode": syllable_mode,
"ipa_applied": en_col_type is not None and not skip_ipa,
"ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
"syllables_applied": syllable_insertions > 0,
},
"duration_seconds": round(duration, 2),
@@ -1554,8 +1582,8 @@ async def _build_grid_core(
@router.post("/sessions/{session_id}/build-grid")
async def build_grid(
session_id: str,
ipa_mode: str = Query("auto", pattern="^(auto|all|none)$"),
syllable_mode: str = Query("auto", pattern="^(auto|all|none)$"),
ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
):
"""Build a structured, zone-aware grid from existing Kombi word results.