Add IPA and syllable mode toggles, fix false IPA on German documents
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 2m1s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 15s

Backend: Remove the en_col_type fallback heuristic (longest average text
length), which incorrectly identified German columns as English. In auto
mode, IPA is now applied only when OCR bracket patterns are actually found;
in all mode it falls back to the headword column from dictionary detection.
Add ipa_mode (auto/all/none) and syllable_mode (auto/all/none) query params
to the build-grid API.

Frontend: Add IPA and Silben (syllable) dropdown selects to GridToolbar. The
selected modes are passed as query params on rebuild. Auto = current smart
detection, All = force for all words, Aus (none) = skip entirely.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-25 08:04:44 +01:00
parent c42924a94a
commit 34680732f8
6 changed files with 165 additions and 55 deletions

View File

@@ -194,6 +194,8 @@ def insert_syllable_dividers(
zones_data: List[Dict],
img_bgr: np.ndarray,
session_id: str,
*,
force: bool = False,
) -> int:
"""Insert pipe syllable dividers into dictionary cells.
@@ -204,6 +206,10 @@ def insert_syllable_dividers(
OCR. This guards against pages with zero pipe characters (the primary
guard — article_col_index — is checked at the call site).
Args:
force: If True, skip the pipe-ratio pre-check and syllabify all
content words regardless of whether the original has pipe dividers.
Returns the number of cells modified.
"""
hyph_de, hyph_en = _get_hyphenators()
@@ -215,24 +221,25 @@ def insert_syllable_dividers(
# Real dictionary pages with printed syllable dividers will have OCR-
# detected pipes in many cells. Pages without syllable dividers will
# have zero — skip those to avoid false syllabification.
total_col_cells = 0
cells_with_pipes = 0
for z in zones_data:
for cell in z.get("cells", []):
if cell.get("col_type", "").startswith("column_"):
total_col_cells += 1
if "|" in cell.get("text", ""):
cells_with_pipes += 1
if not force:
total_col_cells = 0
cells_with_pipes = 0
for z in zones_data:
for cell in z.get("cells", []):
if cell.get("col_type", "").startswith("column_"):
total_col_cells += 1
if "|" in cell.get("text", ""):
cells_with_pipes += 1
if total_col_cells > 0:
pipe_ratio = cells_with_pipes / total_col_cells
if pipe_ratio < 0.01:
logger.info(
"build-grid session %s: skipping syllable insertion — "
"only %.1f%% of cells have existing pipes (need >=1%%)",
session_id, pipe_ratio * 100,
)
return 0
if total_col_cells > 0:
pipe_ratio = cells_with_pipes / total_col_cells
if pipe_ratio < 0.01:
logger.info(
"build-grid session %s: skipping syllable insertion — "
"only %.1f%% of cells have existing pipes (need >=1%%)",
session_id, pipe_ratio * 100,
)
return 0
insertions = 0
for z in zones_data:

View File

@@ -18,7 +18,7 @@ from typing import Any, Dict, List, Optional, Tuple
import cv2
import numpy as np
from fastapi import APIRouter, HTTPException, Request
from fastapi import APIRouter, HTTPException, Query, Request
from cv_box_detect import detect_boxes, split_page_into_zones
from cv_graphic_detect import detect_graphic_elements
@@ -67,12 +67,22 @@ router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
# Core computation (used by build-grid endpoint and regression tests)
# ---------------------------------------------------------------------------
async def _build_grid_core(session_id: str, session: dict) -> dict:
async def _build_grid_core(
session_id: str,
session: dict,
*,
ipa_mode: str = "auto",
syllable_mode: str = "auto",
) -> dict:
"""Core grid building logic — pure computation, no HTTP or DB side effects.
Args:
session_id: Session identifier (for logging and image loading).
session: Full session dict from get_session_db().
ipa_mode: "auto" (only when English headwords detected), "all"
(force IPA on all content columns), or "none" (skip IPA entirely).
syllable_mode: "auto" (only when original has pipe dividers),
"all" (force syllabification on all words), or "none" (skip).
Returns:
StructuredGrid result dict.
@@ -859,32 +869,28 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
total_cols = sum(len(z.get("columns", [])) for z in zones_data)
en_col_type = None
if total_cols >= 3:
skip_ipa = (ipa_mode == "none")
if not skip_ipa and total_cols >= 3:
# Find the column that contains IPA brackets → English headwords.
# Count cells with bracket patterns per col_type. The column with
# the most brackets is the headword column (IPA sits after or below
# headwords). Falls back to longest-average if no brackets found.
# headwords).
col_bracket_count: Dict[str, int] = {}
col_avg_len: Dict[str, List[int]] = {}
for cell in all_cells:
ct = cell.get("col_type", "")
txt = cell.get("text", "") or ""
col_avg_len.setdefault(ct, []).append(len(txt))
if ct.startswith("column_") and '[' in txt:
col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1
# Pick column with most bracket IPA patterns
# Pick column with most bracket IPA patterns.
# ipa_mode="auto": only when OCR already found bracket IPA (no fallback).
# ipa_mode="all": fallback to headword_col_index from dictionary detection.
if col_bracket_count:
en_col_type = max(col_bracket_count, key=col_bracket_count.get)
else:
# Fallback: longest average text
best_avg = 0
for ct, lengths in col_avg_len.items():
if not ct.startswith("column_"):
continue
avg = sum(lengths) / len(lengths) if lengths else 0
if avg > best_avg:
best_avg = avg
en_col_type = ct
elif ipa_mode == "all":
# Force IPA: use headword column from dictionary detection
hw_idx = dict_detection.get("headword_col_index")
if hw_idx is not None:
en_col_type = f"column_{hw_idx + 1}"
if en_col_type:
for cell in all_cells:
if cell.get("col_type") == en_col_type:
@@ -912,7 +918,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
# the EN headword column may not be the longest-average column.
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
ipa_cont_fixed = 0
for z in zones_data:
for z in ([] if skip_ipa else zones_data):
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
z_cells = z.get("cells", [])
for idx, row in enumerate(rows_sorted):
@@ -1110,7 +1116,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
# Reject matches that look like grammar: "sb/sth up a) jdn/"
_SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
slash_ipa_fixed = 0
for z in zones_data:
for z in ([] if skip_ipa else zones_data):
for cell in z.get("cells", []):
# Only process English headword column — avoid converting
# German text like "der/die/das" to IPA.
@@ -1469,22 +1475,28 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
logger.warning("Dictionary detection failed: %s", e)
# --- Syllable divider insertion for dictionary pages ---
# Only on confirmed dictionary pages with article columns (der/die/das).
# The article_col_index check avoids false positives on synonym lists,
# word frequency tables, and other alphabetically sorted non-dictionary pages.
# Additionally, insert_syllable_dividers has its own pre-check for existing
# pipe characters in cells (OCR must have already found some).
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
# "all" = force syllabification on all content words,
# "none" = skip entirely.
syllable_insertions = 0
if (dict_detection.get("is_dictionary")
and dict_detection.get("article_col_index") is not None
and img_bgr is not None):
try:
from cv_syllable_detect import insert_syllable_dividers
syllable_insertions = insert_syllable_dividers(
zones_data, img_bgr, session_id,
)
except Exception as e:
logger.warning("Syllable insertion failed: %s", e)
if syllable_mode != "none" and img_bgr is not None:
_syllable_eligible = False
if syllable_mode == "all":
_syllable_eligible = True
elif (dict_detection.get("is_dictionary")
and dict_detection.get("article_col_index") is not None):
# auto: only on dictionary pages with article columns
_syllable_eligible = True
if _syllable_eligible:
try:
from cv_syllable_detect import insert_syllable_dividers
force_syllables = (syllable_mode == "all")
syllable_insertions = insert_syllable_dividers(
zones_data, img_bgr, session_id,
force=force_syllables,
)
except Exception as e:
logger.warning("Syllable insertion failed: %s", e)
# Clean up internal flags before returning
for z in zones_data:
@@ -1523,6 +1535,12 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
"article_col_index": dict_detection.get("article_col_index"),
"headword_col_index": dict_detection.get("headword_col_index"),
},
"processing_modes": {
"ipa_mode": ipa_mode,
"syllable_mode": syllable_mode,
"ipa_applied": en_col_type is not None and not skip_ipa,
"syllables_applied": syllable_insertions > 0,
},
"duration_seconds": round(duration, 2),
}
@@ -1534,12 +1552,20 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/build-grid")
async def build_grid(session_id: str):
async def build_grid(
session_id: str,
ipa_mode: str = Query("auto", pattern="^(auto|all|none)$"),
syllable_mode: str = Query("auto", pattern="^(auto|all|none)$"),
):
"""Build a structured, zone-aware grid from existing Kombi word results.
Requires that paddle-kombi or rapid-kombi has already been run on the session.
Uses the image for box detection and the word positions for grid structuring.
Query params:
ipa_mode: "auto" (only when English IPA detected), "all" (force), "none" (skip)
syllable_mode: "auto" (only when original has dividers), "all" (force), "none" (skip)
Returns a StructuredGrid with zones, each containing their own
columns, rows, and cells — ready for the frontend Excel-like editor.
"""
@@ -1548,7 +1574,10 @@ async def build_grid(session_id: str):
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
try:
result = await _build_grid_core(session_id, session)
result = await _build_grid_core(
session_id, session,
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))