From 34680732f8bb82ef350021cf99790a7a6826c80e Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 25 Mar 2026 08:04:44 +0100 Subject: [PATCH] Add IPA and syllable mode toggles, fix false IPA on German documents Backend: Remove en_col_type fallback heuristic (longest avg text) that incorrectly identified German columns as English. IPA now only applied when OCR bracket patterns are actually found. Add ipa_mode (auto/all/none) and syllable_mode (auto/all/none) query params to build-grid API. Frontend: Add IPA and Silben dropdown selects to GridToolbar. Modes are passed as query params on rebuild. Auto = current smart detection, All = force for all words, Aus = skip entirely. Co-Authored-By: Claude Opus 4.6 --- .../components/grid-editor/GridEditor.tsx | 8 ++ .../components/grid-editor/GridToolbar.tsx | 46 ++++++++ .../components/grid-editor/useGridEditor.ts | 16 ++- .../ocr-pipeline/StepGridReview.tsx | 8 ++ klausur-service/backend/cv_syllable_detect.py | 41 ++++--- klausur-service/backend/grid_editor_api.py | 101 +++++++++++------- 6 files changed, 165 insertions(+), 55 deletions(-) diff --git a/admin-lehrer/components/grid-editor/GridEditor.tsx b/admin-lehrer/components/grid-editor/GridEditor.tsx index 8d7ee12..0f1c928 100644 --- a/admin-lehrer/components/grid-editor/GridEditor.tsx +++ b/admin-lehrer/components/grid-editor/GridEditor.tsx @@ -36,6 +36,10 @@ export function GridEditor({ sessionId, onNext }: GridEditorProps) { addColumn, deleteRow, addRow, + ipaMode, + setIpaMode, + syllableMode, + setSyllableMode, } = useGridEditor(sessionId) const [showOverlay, setShowOverlay] = useState(false) @@ -183,11 +187,15 @@ export function GridEditor({ sessionId, onNext }: GridEditorProps) { canUndo={canUndo} canRedo={canRedo} showOverlay={showOverlay} + ipaMode={ipaMode} + syllableMode={syllableMode} onSave={saveGrid} onUndo={undo} onRedo={redo} onRebuild={buildGrid} onToggleOverlay={() => setShowOverlay(!showOverlay)} + onIpaModeChange={setIpaMode} + onSyllableModeChange={setSyllableMode} /> diff --git a/admin-lehrer/components/grid-editor/GridToolbar.tsx b/admin-lehrer/components/grid-editor/GridToolbar.tsx index dc7100a..0113f15 100644 --- a/admin-lehrer/components/grid-editor/GridToolbar.tsx +++ b/admin-lehrer/components/grid-editor/GridToolbar.tsx @@ -1,16 +1,34 @@ 'use client' +import type { IpaMode, SyllableMode } from './useGridEditor' + interface GridToolbarProps { dirty: boolean saving: boolean canUndo: boolean canRedo: boolean showOverlay: boolean + ipaMode: IpaMode + syllableMode: SyllableMode onSave: () => void onUndo: () => void onRedo: () => void onRebuild: () => void onToggleOverlay: () => void + onIpaModeChange: (mode: IpaMode) => void + onSyllableModeChange: (mode: SyllableMode) => void +} + +const IPA_LABELS: Record = { + auto: 'IPA: Auto', + all: 'IPA: Alle', + none: 'IPA: Aus', +} + +const SYLLABLE_LABELS: Record = { + auto: 'Silben: Original', + all: 'Silben: Alle', + none: 'Silben: Aus', } export function GridToolbar({ @@ -19,11 +37,15 @@ export function GridToolbar({ canUndo, canRedo, showOverlay, + ipaMode, + syllableMode, onSave, onUndo, onRedo, onRebuild, onToggleOverlay, + onIpaModeChange, + onSyllableModeChange, }: GridToolbarProps) { return (
@@ -67,6 +89,30 @@ export function GridToolbar({ Bild-Overlay + {/* IPA mode */} + + + {/* Syllable mode */} + + {/* Rebuild */}
diff --git a/klausur-service/backend/cv_syllable_detect.py b/klausur-service/backend/cv_syllable_detect.py index 9057afc..e86ef09 100644 --- a/klausur-service/backend/cv_syllable_detect.py +++ b/klausur-service/backend/cv_syllable_detect.py @@ -194,6 +194,8 @@ def insert_syllable_dividers( zones_data: List[Dict], img_bgr: np.ndarray, session_id: str, + *, + force: bool = False, ) -> int: """Insert pipe syllable dividers into dictionary cells. @@ -204,6 +206,10 @@ def insert_syllable_dividers( OCR. This guards against pages with zero pipe characters (the primary guard — article_col_index — is checked at the call site). + Args: + force: If True, skip the pipe-ratio pre-check and syllabify all + content words regardless of whether the original has pipe dividers. + Returns the number of cells modified. """ hyph_de, hyph_en = _get_hyphenators() @@ -215,24 +221,25 @@ def insert_syllable_dividers( # Real dictionary pages with printed syllable dividers will have OCR- # detected pipes in many cells. Pages without syllable dividers will # have zero — skip those to avoid false syllabification. - total_col_cells = 0 - cells_with_pipes = 0 - for z in zones_data: - for cell in z.get("cells", []): - if cell.get("col_type", "").startswith("column_"): - total_col_cells += 1 - if "|" in cell.get("text", ""): - cells_with_pipes += 1 + if not force: + total_col_cells = 0 + cells_with_pipes = 0 + for z in zones_data: + for cell in z.get("cells", []): + if cell.get("col_type", "").startswith("column_"): + total_col_cells += 1 + if "|" in cell.get("text", ""): + cells_with_pipes += 1 - if total_col_cells > 0: - pipe_ratio = cells_with_pipes / total_col_cells - if pipe_ratio < 0.01: - logger.info( - "build-grid session %s: skipping syllable insertion — " - "only %.1f%% of cells have existing pipes (need >=1%%)", - session_id, pipe_ratio * 100, - ) - return 0 + if total_col_cells > 0: + pipe_ratio = cells_with_pipes / total_col_cells + if pipe_ratio < 0.01: + logger.info( + "build-grid session %s: skipping syllable insertion — " + "only %.1f%% of cells have existing pipes (need >=1%%)", + session_id, pipe_ratio * 100, + ) + return 0 insertions = 0 for z in zones_data: diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 69fe315..cc190c0 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -18,7 +18,7 @@ from typing import Any, Dict, List, Optional, Tuple import cv2 import numpy as np -from fastapi import APIRouter, HTTPException, Request +from fastapi import APIRouter, HTTPException, Query, Request from cv_box_detect import detect_boxes, split_page_into_zones from cv_graphic_detect import detect_graphic_elements @@ -67,12 +67,22 @@ router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"]) # Core computation (used by build-grid endpoint and regression tests) # --------------------------------------------------------------------------- -async def _build_grid_core(session_id: str, session: dict) -> dict: +async def _build_grid_core( + session_id: str, + session: dict, + *, + ipa_mode: str = "auto", + syllable_mode: str = "auto", +) -> dict: """Core grid building logic — pure computation, no HTTP or DB side effects. Args: session_id: Session identifier (for logging and image loading). session: Full session dict from get_session_db(). + ipa_mode: "auto" (only when English headwords detected), "all" + (force IPA on all content columns), or "none" (skip IPA entirely). + syllable_mode: "auto" (only when original has pipe dividers), + "all" (force syllabification on all words), or "none" (skip). Returns: StructuredGrid result dict. @@ -859,32 +869,28 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: all_cells = [cell for z in zones_data for cell in z.get("cells", [])] total_cols = sum(len(z.get("columns", [])) for z in zones_data) en_col_type = None - if total_cols >= 3: + skip_ipa = (ipa_mode == "none") + if not skip_ipa and total_cols >= 3: # Find the column that contains IPA brackets → English headwords. # Count cells with bracket patterns per col_type. The column with # the most brackets is the headword column (IPA sits after or below - # headwords). Falls back to longest-average if no brackets found. + # headwords). col_bracket_count: Dict[str, int] = {} - col_avg_len: Dict[str, List[int]] = {} for cell in all_cells: ct = cell.get("col_type", "") txt = cell.get("text", "") or "" - col_avg_len.setdefault(ct, []).append(len(txt)) if ct.startswith("column_") and '[' in txt: col_bracket_count[ct] = col_bracket_count.get(ct, 0) + 1 - # Pick column with most bracket IPA patterns + # Pick column with most bracket IPA patterns. + # ipa_mode="auto": only when OCR already found bracket IPA (no fallback). + # ipa_mode="all": fallback to headword_col_index from dictionary detection. if col_bracket_count: en_col_type = max(col_bracket_count, key=col_bracket_count.get) - else: - # Fallback: longest average text - best_avg = 0 - for ct, lengths in col_avg_len.items(): - if not ct.startswith("column_"): - continue - avg = sum(lengths) / len(lengths) if lengths else 0 - if avg > best_avg: - best_avg = avg - en_col_type = ct + elif ipa_mode == "all": + # Force IPA: use headword column from dictionary detection + hw_idx = dict_detection.get("headword_col_index") + if hw_idx is not None: + en_col_type = f"column_{hw_idx + 1}" if en_col_type: for cell in all_cells: if cell.get("col_type") == en_col_type: @@ -912,7 +918,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # the EN headword column may not be the longest-average column. _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") ipa_cont_fixed = 0 - for z in zones_data: + for z in ([] if skip_ipa else zones_data): rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"]) z_cells = z.get("cells", []) for idx, row in enumerate(rows_sorted): @@ -1110,7 +1116,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # Reject matches that look like grammar: "sb/sth up a) jdn/" _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]') slash_ipa_fixed = 0 - for z in zones_data: + for z in ([] if skip_ipa else zones_data): for cell in z.get("cells", []): # Only process English headword column — avoid converting # German text like "der/die/das" to IPA. @@ -1469,22 +1475,28 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: logger.warning("Dictionary detection failed: %s", e) # --- Syllable divider insertion for dictionary pages --- - # Only on confirmed dictionary pages with article columns (der/die/das). - # The article_col_index check avoids false positives on synonym lists, - # word frequency tables, and other alphabetically sorted non-dictionary pages. - # Additionally, insert_syllable_dividers has its own pre-check for existing - # pipe characters in cells (OCR must have already found some). + # syllable_mode: "auto" = only when original has pipe dividers (1% threshold), + # "all" = force syllabification on all content words, + # "none" = skip entirely. syllable_insertions = 0 - if (dict_detection.get("is_dictionary") - and dict_detection.get("article_col_index") is not None - and img_bgr is not None): - try: - from cv_syllable_detect import insert_syllable_dividers - syllable_insertions = insert_syllable_dividers( - zones_data, img_bgr, session_id, - ) - except Exception as e: - logger.warning("Syllable insertion failed: %s", e) + if syllable_mode != "none" and img_bgr is not None: + _syllable_eligible = False + if syllable_mode == "all": + _syllable_eligible = True + elif (dict_detection.get("is_dictionary") + and dict_detection.get("article_col_index") is not None): + # auto: only on dictionary pages with article columns + _syllable_eligible = True + if _syllable_eligible: + try: + from cv_syllable_detect import insert_syllable_dividers + force_syllables = (syllable_mode == "all") + syllable_insertions = insert_syllable_dividers( + zones_data, img_bgr, session_id, + force=force_syllables, + ) + except Exception as e: + logger.warning("Syllable insertion failed: %s", e) # Clean up internal flags before returning for z in zones_data: @@ -1523,6 +1535,12 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: "article_col_index": dict_detection.get("article_col_index"), "headword_col_index": dict_detection.get("headword_col_index"), }, + "processing_modes": { + "ipa_mode": ipa_mode, + "syllable_mode": syllable_mode, + "ipa_applied": en_col_type is not None and not skip_ipa, + "syllables_applied": syllable_insertions > 0, + }, "duration_seconds": round(duration, 2), } @@ -1534,12 +1552,20 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # --------------------------------------------------------------------------- @router.post("/sessions/{session_id}/build-grid") -async def build_grid(session_id: str): +async def build_grid( + session_id: str, + ipa_mode: str = Query("auto", pattern="^(auto|all|none)$"), + syllable_mode: str = Query("auto", pattern="^(auto|all|none)$"), +): """Build a structured, zone-aware grid from existing Kombi word results. Requires that paddle-kombi or rapid-kombi has already been run on the session. Uses the image for box detection and the word positions for grid structuring. + Query params: + ipa_mode: "auto" (only when English IPA detected), "all" (force), "none" (skip) + syllable_mode: "auto" (only when original has dividers), "all" (force), "none" (skip) + Returns a StructuredGrid with zones, each containing their own columns, rows, and cells — ready for the frontend Excel-like editor. """ @@ -1548,7 +1574,10 @@ async def build_grid(session_id: str): raise HTTPException(status_code=404, detail=f"Session {session_id} not found") try: - result = await _build_grid_core(session_id, session) + result = await _build_grid_core( + session_id, session, + ipa_mode=ipa_mode, syllable_mode=syllable_mode, + ) except ValueError as e: raise HTTPException(status_code=400, detail=str(e))