Add language-specific IPA and syllable modes (de/en)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 15s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 15s
Extend ipa_mode and syllable_mode toggles with language options:

- auto: smart detection (default)
- en: only English headword column
- de: only German definition columns
- all: all content columns
- none: skip entirely

Also improve English column auto-detection: use garbled IPA patterns (apostrophes, colons) in addition to bracket patterns. This correctly identifies English dictionary pages where OCR produces garbled ASCII instead of bracket IPA.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -21,12 +21,16 @@ interface GridToolbarProps {
|
||||
|
||||
const IPA_LABELS: Record<IpaMode, string> = {
|
||||
auto: 'IPA: Auto',
|
||||
en: 'IPA: nur EN',
|
||||
de: 'IPA: nur DE',
|
||||
all: 'IPA: Alle',
|
||||
none: 'IPA: Aus',
|
||||
}
|
||||
|
||||
const SYLLABLE_LABELS: Record<SyllableMode, string> = {
|
||||
auto: 'Silben: Original',
|
||||
en: 'Silben: nur EN',
|
||||
de: 'Silben: nur DE',
|
||||
all: 'Silben: Alle',
|
||||
none: 'Silben: Aus',
|
||||
}
|
||||
|
||||
@@ -14,8 +14,8 @@ export interface GridEditorState {
|
||||
selectedZone: number | null
|
||||
}
|
||||
|
||||
export type IpaMode = 'auto' | 'all' | 'none'
|
||||
export type SyllableMode = 'auto' | 'all' | 'none'
|
||||
export type IpaMode = 'auto' | 'all' | 'de' | 'en' | 'none'
|
||||
export type SyllableMode = 'auto' | 'all' | 'de' | 'en' | 'none'
|
||||
|
||||
export function useGridEditor(sessionId: string | null) {
|
||||
const [grid, setGrid] = useState<StructuredGrid | null>(null)
|
||||
|
||||
@@ -196,6 +196,7 @@ def insert_syllable_dividers(
|
||||
session_id: str,
|
||||
*,
|
||||
force: bool = False,
|
||||
col_filter: Optional[set] = None,
|
||||
) -> int:
|
||||
"""Insert pipe syllable dividers into dictionary cells.
|
||||
|
||||
@@ -209,6 +210,8 @@ def insert_syllable_dividers(
|
||||
Args:
|
||||
force: If True, skip the pipe-ratio pre-check and syllabify all
|
||||
content words regardless of whether the original has pipe dividers.
|
||||
col_filter: If set, only process cells whose col_type is in this set.
|
||||
None means process all content columns.
|
||||
|
||||
Returns the number of cells modified.
|
||||
"""
|
||||
@@ -247,6 +250,8 @@ def insert_syllable_dividers(
|
||||
ct = cell.get("col_type", "")
|
||||
if not ct.startswith("column_"):
|
||||
continue
|
||||
if col_filter is not None and ct not in col_filter:
|
||||
continue
|
||||
text = cell.get("text", "")
|
||||
if not text:
|
||||
continue
|
||||
|
||||
@@ -80,9 +80,11 @@ async def _build_grid_core(
|
||||
session_id: Session identifier (for logging and image loading).
|
||||
session: Full session dict from get_session_db().
|
||||
ipa_mode: "auto" (only when English headwords detected), "all"
|
||||
(force IPA on all content columns), or "none" (skip IPA entirely).
|
||||
(force IPA on all content columns), "en" (English column only),
|
||||
"de" (German/definition columns only), or "none" (skip entirely).
|
||||
syllable_mode: "auto" (only when original has pipe dividers),
|
||||
"all" (force syllabification on all words), or "none" (skip).
|
||||
"all" (force syllabification on all words), "en" (English only),
|
||||
"de" (German only), or "none" (skip).
|
||||
|
||||
Returns:
|
||||
StructuredGrid result dict.
|
||||
@@ -869,32 +871,51 @@ async def _build_grid_core(
|
||||
all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
|
||||
total_cols = sum(len(z.get("columns", [])) for z in zones_data)
|
||||
en_col_type = None
|
||||
ipa_target_cols: set = set()
|
||||
all_content_cols: set = set()
|
||||
skip_ipa = (ipa_mode == "none")
|
||||
if not skip_ipa and total_cols >= 3:
|
||||
# Find the column that contains IPA brackets → English headwords.
|
||||
# Count cells with bracket patterns per col_type. The column with
|
||||
# the most brackets is the headword column (IPA sits after or below
|
||||
# headwords).
|
||||
col_bracket_count: Dict[str, int] = {}
|
||||
# Detect English headword column via IPA signals (brackets or garbled).
|
||||
col_ipa_count: Dict[str, int] = {}
|
||||
all_content_cols: set = set()
|
||||
for cell in all_cells:
|
||||
ct = cell.get("col_type", "")
|
||||
if not ct.startswith("column_"):
|
||||
continue
|
||||
txt = cell.get("text", "") or ""
|
||||
if ct.startswith("column_") and '[' in txt:
|
||||
col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1
|
||||
# Pick column with most bracket IPA patterns.
|
||||
# ipa_mode="auto": only when OCR already found bracket IPA (no fallback).
|
||||
# ipa_mode="all": fallback to headword_col_index from dictionary detection.
|
||||
if col_bracket_count:
|
||||
en_col_type = max(col_bracket_count, key=col_bracket_count.get)
|
||||
elif ipa_mode == "all":
|
||||
# Force IPA: use headword column from dictionary detection
|
||||
hw_idx = dict_detection.get("headword_col_index")
|
||||
if hw_idx is not None:
|
||||
en_col_type = f"column_{hw_idx + 1}"
|
||||
if en_col_type:
|
||||
if txt.strip():
|
||||
all_content_cols.add(ct)
|
||||
if '[' in txt or _text_has_garbled_ipa(txt):
|
||||
col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
|
||||
if col_ipa_count:
|
||||
en_col_type = max(col_ipa_count, key=col_ipa_count.get)
|
||||
elif ipa_mode in ("all", "de", "en"):
|
||||
# Force mode without auto-detection: pick column with most cells
|
||||
col_cell_count: Dict[str, int] = {}
|
||||
for cell in all_cells:
|
||||
if cell.get("col_type") == en_col_type:
|
||||
cell["_orig_col_type"] = en_col_type
|
||||
ct = cell.get("col_type", "")
|
||||
if ct.startswith("column_") and (cell.get("text") or "").strip():
|
||||
col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
|
||||
if col_cell_count:
|
||||
en_col_type = max(col_cell_count, key=col_cell_count.get)
|
||||
|
||||
# Decide which columns to process based on ipa_mode:
|
||||
# auto/en: only the detected EN headword column
|
||||
# de: all content columns EXCEPT the EN column
|
||||
# all: all content columns
|
||||
ipa_target_cols: set = set()
|
||||
if ipa_mode in ("auto", "en"):
|
||||
if en_col_type:
|
||||
ipa_target_cols.add(en_col_type)
|
||||
elif ipa_mode == "de":
|
||||
ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
|
||||
elif ipa_mode == "all":
|
||||
ipa_target_cols = all_content_cols
|
||||
|
||||
if ipa_target_cols:
|
||||
for cell in all_cells:
|
||||
if cell.get("col_type") in ipa_target_cols:
|
||||
cell["_orig_col_type"] = cell["col_type"]
|
||||
cell["col_type"] = "column_en"
|
||||
# Snapshot text before IPA fix to detect which cells were modified
|
||||
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
|
||||
@@ -1476,24 +1497,31 @@ async def _build_grid_core(
|
||||
|
||||
# --- Syllable divider insertion for dictionary pages ---
|
||||
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
|
||||
# "all" = force syllabification on all content words,
|
||||
# "none" = skip entirely.
|
||||
# "all" = force on all content words, "en" = English column only,
|
||||
# "de" = German columns only, "none" = skip entirely.
|
||||
syllable_insertions = 0
|
||||
if syllable_mode != "none" and img_bgr is not None:
|
||||
_syllable_eligible = False
|
||||
if syllable_mode == "all":
|
||||
if syllable_mode in ("all", "de", "en"):
|
||||
_syllable_eligible = True
|
||||
elif (dict_detection.get("is_dictionary")
|
||||
and dict_detection.get("article_col_index") is not None):
|
||||
# auto: only on dictionary pages with article columns
|
||||
_syllable_eligible = True
|
||||
# For language-specific modes, determine allowed columns
|
||||
_syllable_col_filter: Optional[set] = None # None = all columns
|
||||
if syllable_mode == "en" and en_col_type:
|
||||
_syllable_col_filter = {en_col_type}
|
||||
elif syllable_mode == "de" and en_col_type:
|
||||
_syllable_col_filter = all_content_cols - {en_col_type} if total_cols >= 3 else None
|
||||
if _syllable_eligible:
|
||||
try:
|
||||
from cv_syllable_detect import insert_syllable_dividers
|
||||
force_syllables = (syllable_mode == "all")
|
||||
force_syllables = (syllable_mode in ("all", "de", "en"))
|
||||
syllable_insertions = insert_syllable_dividers(
|
||||
zones_data, img_bgr, session_id,
|
||||
force=force_syllables,
|
||||
col_filter=_syllable_col_filter,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("Syllable insertion failed: %s", e)
|
||||
@@ -1538,7 +1566,7 @@ async def _build_grid_core(
|
||||
"processing_modes": {
|
||||
"ipa_mode": ipa_mode,
|
||||
"syllable_mode": syllable_mode,
|
||||
"ipa_applied": en_col_type is not None and not skip_ipa,
|
||||
"ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
|
||||
"syllables_applied": syllable_insertions > 0,
|
||||
},
|
||||
"duration_seconds": round(duration, 2),
|
||||
@@ -1554,8 +1582,8 @@ async def _build_grid_core(
|
||||
@router.post("/sessions/{session_id}/build-grid")
|
||||
async def build_grid(
|
||||
session_id: str,
|
||||
ipa_mode: str = Query("auto", pattern="^(auto|all|none)$"),
|
||||
syllable_mode: str = Query("auto", pattern="^(auto|all|none)$"),
|
||||
ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
|
||||
syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
|
||||
):
|
||||
"""Build a structured, zone-aware grid from existing Kombi word results.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user