Add language-specific IPA and syllable modes (de/en)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 15s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 15s
Extend the ipa_mode and syllable_mode toggles with language options:
- auto: smart detection (default)
- en: only the English headword column
- de: only the German definition columns
- all: all content columns
- none: skip entirely

Also improve English column auto-detection: use garbled IPA patterns (apostrophes, colons) in addition to bracket patterns. This correctly identifies English dictionary pages where OCR produces garbled ASCII instead of bracket IPA.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -21,12 +21,16 @@ interface GridToolbarProps {
|
|||||||
|
|
||||||
const IPA_LABELS: Record<IpaMode, string> = {
|
const IPA_LABELS: Record<IpaMode, string> = {
|
||||||
auto: 'IPA: Auto',
|
auto: 'IPA: Auto',
|
||||||
|
en: 'IPA: nur EN',
|
||||||
|
de: 'IPA: nur DE',
|
||||||
all: 'IPA: Alle',
|
all: 'IPA: Alle',
|
||||||
none: 'IPA: Aus',
|
none: 'IPA: Aus',
|
||||||
}
|
}
|
||||||
|
|
||||||
const SYLLABLE_LABELS: Record<SyllableMode, string> = {
|
const SYLLABLE_LABELS: Record<SyllableMode, string> = {
|
||||||
auto: 'Silben: Original',
|
auto: 'Silben: Original',
|
||||||
|
en: 'Silben: nur EN',
|
||||||
|
de: 'Silben: nur DE',
|
||||||
all: 'Silben: Alle',
|
all: 'Silben: Alle',
|
||||||
none: 'Silben: Aus',
|
none: 'Silben: Aus',
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,8 +14,8 @@ export interface GridEditorState {
|
|||||||
selectedZone: number | null
|
selectedZone: number | null
|
||||||
}
|
}
|
||||||
|
|
||||||
export type IpaMode = 'auto' | 'all' | 'none'
|
export type IpaMode = 'auto' | 'all' | 'de' | 'en' | 'none'
|
||||||
export type SyllableMode = 'auto' | 'all' | 'none'
|
export type SyllableMode = 'auto' | 'all' | 'de' | 'en' | 'none'
|
||||||
|
|
||||||
export function useGridEditor(sessionId: string | null) {
|
export function useGridEditor(sessionId: string | null) {
|
||||||
const [grid, setGrid] = useState<StructuredGrid | null>(null)
|
const [grid, setGrid] = useState<StructuredGrid | null>(null)
|
||||||
|
|||||||
@@ -196,6 +196,7 @@ def insert_syllable_dividers(
|
|||||||
session_id: str,
|
session_id: str,
|
||||||
*,
|
*,
|
||||||
force: bool = False,
|
force: bool = False,
|
||||||
|
col_filter: Optional[set] = None,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Insert pipe syllable dividers into dictionary cells.
|
"""Insert pipe syllable dividers into dictionary cells.
|
||||||
|
|
||||||
@@ -209,6 +210,8 @@ def insert_syllable_dividers(
|
|||||||
Args:
|
Args:
|
||||||
force: If True, skip the pipe-ratio pre-check and syllabify all
|
force: If True, skip the pipe-ratio pre-check and syllabify all
|
||||||
content words regardless of whether the original has pipe dividers.
|
content words regardless of whether the original has pipe dividers.
|
||||||
|
col_filter: If set, only process cells whose col_type is in this set.
|
||||||
|
None means process all content columns.
|
||||||
|
|
||||||
Returns the number of cells modified.
|
Returns the number of cells modified.
|
||||||
"""
|
"""
|
||||||
@@ -247,6 +250,8 @@ def insert_syllable_dividers(
|
|||||||
ct = cell.get("col_type", "")
|
ct = cell.get("col_type", "")
|
||||||
if not ct.startswith("column_"):
|
if not ct.startswith("column_"):
|
||||||
continue
|
continue
|
||||||
|
if col_filter is not None and ct not in col_filter:
|
||||||
|
continue
|
||||||
text = cell.get("text", "")
|
text = cell.get("text", "")
|
||||||
if not text:
|
if not text:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -80,9 +80,11 @@ async def _build_grid_core(
|
|||||||
session_id: Session identifier (for logging and image loading).
|
session_id: Session identifier (for logging and image loading).
|
||||||
session: Full session dict from get_session_db().
|
session: Full session dict from get_session_db().
|
||||||
ipa_mode: "auto" (only when English headwords detected), "all"
|
ipa_mode: "auto" (only when English headwords detected), "all"
|
||||||
(force IPA on all content columns), or "none" (skip IPA entirely).
|
(force IPA on all content columns), "en" (English column only),
|
||||||
|
"de" (German/definition columns only), or "none" (skip entirely).
|
||||||
syllable_mode: "auto" (only when original has pipe dividers),
|
syllable_mode: "auto" (only when original has pipe dividers),
|
||||||
"all" (force syllabification on all words), or "none" (skip).
|
"all" (force syllabification on all words), "en" (English only),
|
||||||
|
"de" (German only), or "none" (skip).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
StructuredGrid result dict.
|
StructuredGrid result dict.
|
||||||
@@ -869,32 +871,51 @@ async def _build_grid_core(
|
|||||||
all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
|
all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
|
||||||
total_cols = sum(len(z.get("columns", [])) for z in zones_data)
|
total_cols = sum(len(z.get("columns", [])) for z in zones_data)
|
||||||
en_col_type = None
|
en_col_type = None
|
||||||
|
ipa_target_cols: set = set()
|
||||||
|
all_content_cols: set = set()
|
||||||
skip_ipa = (ipa_mode == "none")
|
skip_ipa = (ipa_mode == "none")
|
||||||
if not skip_ipa and total_cols >= 3:
|
if not skip_ipa and total_cols >= 3:
|
||||||
# Find the column that contains IPA brackets → English headwords.
|
# Detect English headword column via IPA signals (brackets or garbled).
|
||||||
# Count cells with bracket patterns per col_type. The column with
|
col_ipa_count: Dict[str, int] = {}
|
||||||
# the most brackets is the headword column (IPA sits after or below
|
all_content_cols: set = set()
|
||||||
# headwords).
|
|
||||||
col_bracket_count: Dict[str, int] = {}
|
|
||||||
for cell in all_cells:
|
for cell in all_cells:
|
||||||
ct = cell.get("col_type", "")
|
ct = cell.get("col_type", "")
|
||||||
|
if not ct.startswith("column_"):
|
||||||
|
continue
|
||||||
txt = cell.get("text", "") or ""
|
txt = cell.get("text", "") or ""
|
||||||
if ct.startswith("column_") and '[' in txt:
|
if txt.strip():
|
||||||
col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1
|
all_content_cols.add(ct)
|
||||||
# Pick column with most bracket IPA patterns.
|
if '[' in txt or _text_has_garbled_ipa(txt):
|
||||||
# ipa_mode="auto": only when OCR already found bracket IPA (no fallback).
|
col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
|
||||||
# ipa_mode="all": fallback to headword_col_index from dictionary detection.
|
if col_ipa_count:
|
||||||
if col_bracket_count:
|
en_col_type = max(col_ipa_count, key=col_ipa_count.get)
|
||||||
en_col_type = max(col_bracket_count, key=col_bracket_count.get)
|
elif ipa_mode in ("all", "de", "en"):
|
||||||
elif ipa_mode == "all":
|
# Force mode without auto-detection: pick column with most cells
|
||||||
# Force IPA: use headword column from dictionary detection
|
col_cell_count: Dict[str, int] = {}
|
||||||
hw_idx = dict_detection.get("headword_col_index")
|
|
||||||
if hw_idx is not None:
|
|
||||||
en_col_type = f"column_{hw_idx + 1}"
|
|
||||||
if en_col_type:
|
|
||||||
for cell in all_cells:
|
for cell in all_cells:
|
||||||
if cell.get("col_type") == en_col_type:
|
ct = cell.get("col_type", "")
|
||||||
cell["_orig_col_type"] = en_col_type
|
if ct.startswith("column_") and (cell.get("text") or "").strip():
|
||||||
|
col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
|
||||||
|
if col_cell_count:
|
||||||
|
en_col_type = max(col_cell_count, key=col_cell_count.get)
|
||||||
|
|
||||||
|
# Decide which columns to process based on ipa_mode:
|
||||||
|
# auto/en: only the detected EN headword column
|
||||||
|
# de: all content columns EXCEPT the EN column
|
||||||
|
# all: all content columns
|
||||||
|
ipa_target_cols: set = set()
|
||||||
|
if ipa_mode in ("auto", "en"):
|
||||||
|
if en_col_type:
|
||||||
|
ipa_target_cols.add(en_col_type)
|
||||||
|
elif ipa_mode == "de":
|
||||||
|
ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
|
||||||
|
elif ipa_mode == "all":
|
||||||
|
ipa_target_cols = all_content_cols
|
||||||
|
|
||||||
|
if ipa_target_cols:
|
||||||
|
for cell in all_cells:
|
||||||
|
if cell.get("col_type") in ipa_target_cols:
|
||||||
|
cell["_orig_col_type"] = cell["col_type"]
|
||||||
cell["col_type"] = "column_en"
|
cell["col_type"] = "column_en"
|
||||||
# Snapshot text before IPA fix to detect which cells were modified
|
# Snapshot text before IPA fix to detect which cells were modified
|
||||||
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
|
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
|
||||||
@@ -1476,24 +1497,31 @@ async def _build_grid_core(
|
|||||||
|
|
||||||
# --- Syllable divider insertion for dictionary pages ---
|
# --- Syllable divider insertion for dictionary pages ---
|
||||||
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
|
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
|
||||||
# "all" = force syllabification on all content words,
|
# "all" = force on all content words, "en" = English column only,
|
||||||
# "none" = skip entirely.
|
# "de" = German columns only, "none" = skip entirely.
|
||||||
syllable_insertions = 0
|
syllable_insertions = 0
|
||||||
if syllable_mode != "none" and img_bgr is not None:
|
if syllable_mode != "none" and img_bgr is not None:
|
||||||
_syllable_eligible = False
|
_syllable_eligible = False
|
||||||
if syllable_mode == "all":
|
if syllable_mode in ("all", "de", "en"):
|
||||||
_syllable_eligible = True
|
_syllable_eligible = True
|
||||||
elif (dict_detection.get("is_dictionary")
|
elif (dict_detection.get("is_dictionary")
|
||||||
and dict_detection.get("article_col_index") is not None):
|
and dict_detection.get("article_col_index") is not None):
|
||||||
# auto: only on dictionary pages with article columns
|
# auto: only on dictionary pages with article columns
|
||||||
_syllable_eligible = True
|
_syllable_eligible = True
|
||||||
|
# For language-specific modes, determine allowed columns
|
||||||
|
_syllable_col_filter: Optional[set] = None # None = all columns
|
||||||
|
if syllable_mode == "en" and en_col_type:
|
||||||
|
_syllable_col_filter = {en_col_type}
|
||||||
|
elif syllable_mode == "de" and en_col_type:
|
||||||
|
_syllable_col_filter = all_content_cols - {en_col_type} if total_cols >= 3 else None
|
||||||
if _syllable_eligible:
|
if _syllable_eligible:
|
||||||
try:
|
try:
|
||||||
from cv_syllable_detect import insert_syllable_dividers
|
from cv_syllable_detect import insert_syllable_dividers
|
||||||
force_syllables = (syllable_mode == "all")
|
force_syllables = (syllable_mode in ("all", "de", "en"))
|
||||||
syllable_insertions = insert_syllable_dividers(
|
syllable_insertions = insert_syllable_dividers(
|
||||||
zones_data, img_bgr, session_id,
|
zones_data, img_bgr, session_id,
|
||||||
force=force_syllables,
|
force=force_syllables,
|
||||||
|
col_filter=_syllable_col_filter,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Syllable insertion failed: %s", e)
|
logger.warning("Syllable insertion failed: %s", e)
|
||||||
@@ -1538,7 +1566,7 @@ async def _build_grid_core(
|
|||||||
"processing_modes": {
|
"processing_modes": {
|
||||||
"ipa_mode": ipa_mode,
|
"ipa_mode": ipa_mode,
|
||||||
"syllable_mode": syllable_mode,
|
"syllable_mode": syllable_mode,
|
||||||
"ipa_applied": en_col_type is not None and not skip_ipa,
|
"ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
|
||||||
"syllables_applied": syllable_insertions > 0,
|
"syllables_applied": syllable_insertions > 0,
|
||||||
},
|
},
|
||||||
"duration_seconds": round(duration, 2),
|
"duration_seconds": round(duration, 2),
|
||||||
@@ -1554,8 +1582,8 @@ async def _build_grid_core(
|
|||||||
@router.post("/sessions/{session_id}/build-grid")
|
@router.post("/sessions/{session_id}/build-grid")
|
||||||
async def build_grid(
|
async def build_grid(
|
||||||
session_id: str,
|
session_id: str,
|
||||||
ipa_mode: str = Query("auto", pattern="^(auto|all|none)$"),
|
ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
|
||||||
syllable_mode: str = Query("auto", pattern="^(auto|all|none)$"),
|
syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"),
|
||||||
):
|
):
|
||||||
"""Build a structured, zone-aware grid from existing Kombi word results.
|
"""Build a structured, zone-aware grid from existing Kombi word results.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user