Replace old OCR pipeline with Kombi pipeline + add IPA/syllable toggles

Backend:
- _run_ocr_pipeline_for_page() now runs the full Kombi pipeline: orientation → deskew → dewarp → content crop → dual-engine OCR (RapidOCR + Tesseract merge) → _build_grid_core() with pipe-autocorrect, word-gap merge, and dictionary detection
- process-single-page now accepts ipa_mode and syllable_mode query params
- Pipeline sessions are visible in the admin OCR Kombi UI for debugging

Frontend (vocab-worksheet):
- New "Anzeigeoptionen" section with IPA and syllable toggles
- Settings are passed to process-single-page as query parameters

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 00:43:42 +02:00
parent 2828871e42
commit 3b78baf37f
2 changed files with 235 additions and 137 deletions
@@ -1283,12 +1283,18 @@ async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Qu
 async def process_single_page(
    session_id: str,
    page_number: int,
+    ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
+    syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
 ):
    """
-    Process a SINGLE page of an uploaded PDF using the OCR pipeline.
+    Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.

-    Uses the multi-step CV pipeline (deskew → dewarp → columns → rows → words)
-    instead of LLM vision for much better extraction quality.
+    Uses the full Kombi pipeline (orientation → deskew → dewarp → crop →
+    dual-engine OCR → grid-build with autocorrect/merge) for best quality.
+
+    Query params:
+        ipa_mode: "none" (default), "auto", "all", "en", "de"
+        syllable_mode: "none" (default), "auto", "all", "en", "de"

    The frontend should call this sequentially for each page.
    Returns the vocabulary for just this one page.
@@ -1316,6 +1322,7 @@ async def process_single_page(
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
                img_bgr, page_number, session_id,
+                ipa_mode=ipa_mode, syllable_mode=syllable_mode,
            )
        except Exception as e:
            logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
@@ -1384,28 +1391,33 @@ async def _run_ocr_pipeline_for_page(
    img_bgr: np.ndarray,
    page_number: int,
    vocab_session_id: str,
+    *,
+    ipa_mode: str = "none",
+    syllable_mode: str = "none",
 ) -> tuple:
-    """Run the full OCR pipeline on a single page image and return vocab entries.
+    """Run the full Kombi OCR pipeline on a single page and return vocab entries.

-    Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py).
+    Uses the same pipeline as the admin OCR Kombi pipeline:
+    orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
+    (with pipe-autocorrect, word-gap merge, dictionary detection, etc.)

    Args:
-        img_bgr: BGR numpy array (from render_pdf_high_res, same as admin pipeline).
+        img_bgr: BGR numpy array.
        page_number: 0-indexed page number.
        vocab_session_id: Vocab session ID for logging.
+        ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
+        syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".

-    Steps: deskew → dewarp → columns → rows → words → (LLM review)
    Returns (entries, rotation_deg) where entries is a list of dicts and
    rotation_deg is the orientation correction applied (0, 90, 180, 270).
    """
    import time as _time

    t_total = _time.time()
-
    img_h, img_w = img_bgr.shape[:2]
-    logger.info(f"OCR Pipeline page {page_number + 1}: image {img_w}x{img_h}")
+    logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")

-    # 1b. Orientation detection (fix upside-down scans)
+    # 1. Orientation detection (fix upside-down scans)
    t0 = _time.time()
    img_bgr, rotation = detect_and_fix_orientation(img_bgr)
    if rotation:
@@ -1414,7 +1426,7 @@ async def _run_ocr_pipeline_for_page(
    else:
        logger.info(f"  orientation: OK ({_time.time() - t0:.1f}s)")

-    # 2. Create pipeline session in DB (for debugging in admin UI)
+    # 2. Create pipeline session in DB (visible in admin Kombi UI)
    pipeline_session_id = str(uuid.uuid4())
    try:
        _, png_buf = cv2.imencode(".png", img_bgr)
@@ -1428,155 +1440,216 @@ async def _run_ocr_pipeline_for_page(
    except Exception as e:
        logger.warning(f"Could not create pipeline session in DB: {e}")

-    # 3. Three-pass deskew: iterative + word-alignment + text-line regression
+    # 3. Three-pass deskew
    t0 = _time.time()
    deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
-    angle_pass1 = deskew_debug.get("pass1_angle", 0.0)
-    angle_pass2 = deskew_debug.get("pass2_angle", 0.0)
-    angle_pass3 = deskew_debug.get("pass3_angle", 0.0)
-
-    logger.info(f"  deskew: p1={angle_pass1:.2f} p2={angle_pass2:.2f} "
-                f"p3={angle_pass3:.2f} total={angle_applied:.2f} "
-                f"({_time.time() - t0:.1f}s)")
+    logger.info(f"  deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")

    # 4. Dewarp
    t0 = _time.time()
    dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
    logger.info(f"  dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")

-    # 5. Column detection
+    # 5. Content crop (removes scanner borders, gutter shadows)
    t0 = _time.time()
-    ocr_img = create_ocr_image(dewarped_bgr)
-    h, w = ocr_img.shape[:2]
-
-    geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
-    if geo_result is None:
-        layout_img = create_layout_image(dewarped_bgr)
-        regions = analyze_layout(layout_img, ocr_img)
-        word_dicts = None
-        inv = None
-        content_bounds = None
-    else:
-        geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
-        content_w = right_x - left_x
-        header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
-        geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
-                                          top_y=top_y, header_y=header_y, footer_y=footer_y)
-        geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
-        geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
-        content_h = bottom_y - top_y
-        regions = positional_column_regions(geometries, content_w, content_h, left_x)
-        content_bounds = (left_x, right_x, top_y, bottom_y)
-
-    logger.info(f"  columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")
-
-    # 6. Row detection
-    t0 = _time.time()
-    if word_dicts is None or inv is None or content_bounds is None:
-        # Re-run geometry detection to get intermediates
-        geo_result2 = detect_column_geometry(ocr_img, dewarped_bgr)
-        if geo_result2 is None:
-            raise ValueError("Column geometry detection failed — cannot detect rows")
-        _, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result2
-        content_bounds = (left_x, right_x, top_y, bottom_y)
-
-    left_x, right_x, top_y, bottom_y = content_bounds
-    rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
-    logger.info(f"  rows: {len(rows)} detected ({_time.time() - t0:.1f}s)")
-
-    # 7. Word recognition (cell-first OCR v2)
-    t0 = _time.time()
-    col_regions = regions  # already PageRegion objects
-
-    # Populate row.words for word_count filtering
-    for row in rows:
-        row_y_rel = row.y - top_y
-        row_bottom_rel = row_y_rel + row.height
-        row.words = [
-            wd for wd in word_dicts
-            if row_y_rel <= wd['top'] + wd['height'] / 2 < row_bottom_rel
-        ]
-        row.word_count = len(row.words)
-
-    cells, columns_meta = build_cell_grid_v2(
-        ocr_img, col_regions, rows, img_w, img_h,
-        ocr_engine="auto", img_bgr=dewarped_bgr,
-    )
-
-    col_types = {c['type'] for c in columns_meta}
-    is_vocab = bool(col_types & {'column_en', 'column_de'})
-    logger.info(f"  words: {len(cells)} cells, vocab={is_vocab} ({_time.time() - t0:.1f}s)")
-
-    if not is_vocab:
-        logger.warning(f"  Page {page_number + 1}: layout is not vocab table "
-                       f"(types: {col_types}), returning empty")
-        return [], rotation
-
-    # 8. Map cells → vocab entries
-    entries = _cells_to_vocab_entries(cells, columns_meta)
-    entries = _fix_phonetic_brackets(entries, pronunciation="british")
-
-    # 9. Optional LLM review
    try:
-        review_result = await llm_review_entries(entries)
-        if review_result and review_result.get("changes"):
-            # Apply corrections
-            changes_map = {}
-            for ch in review_result["changes"]:
-                idx = ch.get("index")
-                if idx is not None:
-                    changes_map[idx] = ch
-            for idx, ch in changes_map.items():
-                if 0 <= idx < len(entries):
-                    for field in ("english", "german", "example"):
-                        if ch.get(field) and ch[field] != entries[idx].get(field):
-                            entries[idx][field] = ch[field]
-            logger.info(f"  llm review: {len(review_result['changes'])} corrections applied")
+        from page_crop import detect_and_crop_page
+        cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
+        if crop_result.get("crop_applied"):
+            dewarped_bgr = cropped_bgr
+            logger.info(f"  crop: applied ({_time.time() - t0:.1f}s)")
+        else:
+            logger.info(f"  crop: skipped ({_time.time() - t0:.1f}s)")
    except Exception as e:
-        logger.warning(f"  llm review skipped: {e}")
+        logger.warning(f"  crop: failed ({e}), continuing with uncropped image")

-    # 10. Map to frontend format
-    page_vocabulary = []
-    for entry in entries:
-        if not entry.get("english") and not entry.get("german"):
-            continue  # skip empty rows
-        page_vocabulary.append({
-            "id": str(uuid.uuid4()),
-            "english": entry.get("english", ""),
-            "german": entry.get("german", ""),
-            "example_sentence": entry.get("example", ""),
-            "source_page": page_number + 1,
+    # 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
+    t0 = _time.time()
+    img_h, img_w = dewarped_bgr.shape[:2]
+
+    # RapidOCR (local ONNX)
+    try:
+        from cv_ocr_engines import ocr_region_rapid
+        from cv_vocab_types import PageRegion
+        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
+        rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
+    except Exception as e:
+        logger.warning(f"  RapidOCR failed: {e}")
+        rapid_words = []
+
+    # Tesseract
+    from PIL import Image
+    import pytesseract
+    pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
+    data = pytesseract.image_to_data(
+        pil_img, lang="eng+deu", config="--psm 6 --oem 3",
+        output_type=pytesseract.Output.DICT,
+    )
+    tess_words = []
+    for i in range(len(data["text"])):
+        text = str(data["text"][i]).strip()
+        conf_raw = str(data["conf"][i])
+        conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
+        if not text or conf < 20:
+            continue
+        tess_words.append({
+            "text": text,
+            "left": data["left"][i], "top": data["top"][i],
+            "width": data["width"][i], "height": data["height"][i],
+            "conf": conf,
        })

-    # 11. Update pipeline session in DB (for admin debugging)
-    try:
-        success_dsk, dsk_buf = cv2.imencode(".png", deskewed_bgr)
-        deskewed_png = dsk_buf.tobytes() if success_dsk else None
-        success_dwp, dwp_buf = cv2.imencode(".png", dewarped_bgr)
-        dewarped_png = dwp_buf.tobytes() if success_dwp else None
+    # Merge dual-engine results
+    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
+    from cv_words_first import build_grid_from_words

+    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
+    if rapid_split or tess_words:
+        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
+        merged_words = _deduplicate_words(merged_words)
+    else:
+        merged_words = tess_words  # fallback to Tesseract only
+
+    # Build initial grid from merged words
+    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
+    for cell in cells:
+        cell["ocr_engine"] = "rapid_kombi"
+
+    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
+    n_cols = len(columns_meta)
+    logger.info(f"  ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
+                f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")
+
+    # 7. Save word_result to pipeline session (needed by _build_grid_core)
+    word_result = {
+        "cells": cells,
+        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
+        "columns_used": columns_meta,
+        "layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
+        "image_width": img_w,
+        "image_height": img_h,
+        "duration_seconds": 0,
+        "ocr_engine": "rapid_kombi",
+        "raw_tesseract_words": tess_words,
+        "summary": {
+            "total_cells": len(cells),
+            "non_empty_cells": sum(1 for c in cells if c.get("text")),
+        },
+    }
+
+    # Save images + word_result to pipeline session for admin visibility
+    try:
+        _, dsk_buf = cv2.imencode(".png", deskewed_bgr)
+        _, dwp_buf = cv2.imencode(".png", dewarped_bgr)
        await update_pipeline_session_db(
            pipeline_session_id,
-            deskewed_png=deskewed_png,
-            dewarped_png=dewarped_png,
+            deskewed_png=dsk_buf.tobytes(),
+            dewarped_png=dwp_buf.tobytes(),
+            cropped_png=cv2.imencode(".png", dewarped_bgr)[1].tobytes(),
+            word_result=word_result,
            deskew_result={"angle_applied": round(angle_applied, 3)},
            dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
-            column_result={"columns": [{"type": r.type, "x": r.x, "y": r.y,
-                                         "width": r.width, "height": r.height}
-                                        for r in col_regions]},
-            row_result={"total_rows": len(rows)},
-            word_result={
-                "entry_count": len(page_vocabulary),
-                "layout": "vocab",
-                "vocab_entries": entries,
-            },
-            current_step=6,
+            current_step=8,
        )
    except Exception as e:
        logger.warning(f"Could not update pipeline session: {e}")

+    # 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
+    t0 = _time.time()
+    try:
+        from grid_editor_api import _build_grid_core
+        session_data = {
+            "word_result": word_result,
+        }
+        grid_result = await _build_grid_core(
+            pipeline_session_id, session_data,
+            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
+        )
+        logger.info(f"  grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
+                    f"({_time.time() - t0:.1f}s)")
+
+        # Save grid result to pipeline session
+        try:
+            await update_pipeline_session_db(
+                pipeline_session_id,
+                grid_editor_result=grid_result,
+                current_step=11,
+            )
+        except Exception:
+            pass
+
+    except Exception as e:
+        logger.warning(f"  grid-build failed: {e}, falling back to basic grid")
+        grid_result = None
+
+    # 9. Extract vocab entries from grid result (zones → cells → vocab)
+    page_vocabulary = []
+
+    if grid_result and grid_result.get("zones"):
+        # Extract from the improved zone-based grid
+        for zone in grid_result["zones"]:
+            zone_cols = zone.get("columns", [])
+            zone_cells = zone.get("cells", [])
+            if not zone_cols or not zone_cells:
+                continue
+
+            # Build col_index → col_type map
+            col_type_map = {}
+            for col in zone_cols:
+                ci = col.get("col_index", col.get("index", -1))
+                col_type_map[ci] = col.get("type", col.get("col_type", ""))
+
+            # Group cells by row
+            rows_map = {}
+            for cell in zone_cells:
+                ri = cell.get("row_index", 0)
+                if ri not in rows_map:
+                    rows_map[ri] = {}
+                ci = cell.get("col_index", 0)
+                rows_map[ri][ci] = cell
+
+            for ri in sorted(rows_map.keys()):
+                row_cells = rows_map[ri]
+                en = ""
+                de = ""
+                ex = ""
+                for ci, cell in row_cells.items():
+                    ct = col_type_map.get(ci, "")
+                    text = (cell.get("text") or "").strip()
+                    if not text:
+                        continue
+                    if "en" in ct:
+                        en = text
+                    elif "de" in ct:
+                        de = text
+                    elif "example" in ct or "text" in ct:
+                        ex = text if not ex else ex + " " + text
+
+                if en or de:
+                    page_vocabulary.append({
+                        "id": str(uuid.uuid4()),
+                        "english": en,
+                        "german": de,
+                        "example_sentence": ex,
+                        "source_page": page_number + 1,
+                    })
+    else:
+        # Fallback: use basic cells → vocab entries
+        entries = _cells_to_vocab_entries(cells, columns_meta)
+        entries = _fix_phonetic_brackets(entries, pronunciation="british")
+        for entry in entries:
+            if not entry.get("english") and not entry.get("german"):
+                continue
+            page_vocabulary.append({
+                "id": str(uuid.uuid4()),
+                "english": entry.get("english", ""),
+                "german": entry.get("german", ""),
+                "example_sentence": entry.get("example", ""),
+                "source_page": page_number + 1,
+            })
+
    total_duration = _time.time() - t_total
-    logger.info(f"OCR Pipeline page {page_number + 1}: "
+    logger.info(f"Kombi Pipeline page {page_number + 1}: "
                f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")

    return page_vocabulary, rotation
@@ -156,6 +156,8 @@ export default function VocabWorksheetPage() {
  const [includeSolutions, setIncludeSolutions] = useState(true)
  const [lineHeight, setLineHeight] = useState('normal')
  const [selectedFormat, setSelectedFormat] = useState<WorksheetFormat>('standard')
+  const [showIpa, setShowIpa] = useState(false)
+  const [showSyllables, setShowSyllables] = useState(false)

  // Export state
  const [worksheetId, setWorksheetId] = useState<string | null>(null)
@@ -431,7 +433,9 @@ export default function VocabWorksheetPage() {
    const API_BASE = getApiBase()

    try {
-      const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}`, {
+      const ipaParam = showIpa ? 'auto' : 'none'
+      const syllableParam = showSyllables ? 'auto' : 'none'
+      const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}?ipa_mode=${ipaParam}&syllable_mode=${syllableParam}`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ ocr_prompts: ocrPrompts }),
@@ -1907,6 +1911,27 @@ export default function VocabWorksheetPage() {
              )}
            </div>

+            {/* OCR display options */}
+            <div className={`p-4 rounded-xl border ${isDark ? 'bg-white/5 border-white/10' : 'bg-gray-50 border-gray-200'} space-y-3`}>
+              <h4 className={`text-sm font-medium ${isDark ? 'text-white/70' : 'text-slate-600'}`}>Anzeigeoptionen</h4>
+              <div className="flex flex-col gap-2">
+                <label className={`flex items-center gap-3 cursor-pointer ${isDark ? 'text-white' : 'text-slate-900'}`}>
+                  <input type="checkbox" checked={showIpa} onChange={(e) => setShowIpa(e.target.checked)} className="w-5 h-5 rounded border-2 border-purple-500 text-purple-500 focus:ring-purple-500" />
+                  <div>
+                    <span>Lautschrift (IPA) anzeigen</span>
+                    <p className={`text-xs ${isDark ? 'text-white/40' : 'text-slate-400'}`}>z.B. achieve [əˈtʃiːv]</p>
+                  </div>
+                </label>
+                <label className={`flex items-center gap-3 cursor-pointer ${isDark ? 'text-white' : 'text-slate-900'}`}>
+                  <input type="checkbox" checked={showSyllables} onChange={(e) => setShowSyllables(e.target.checked)} className="w-5 h-5 rounded border-2 border-purple-500 text-purple-500 focus:ring-purple-500" />
+                  <div>
+                    <span>Silbentrennung anzeigen</span>
+                    <p className={`text-xs ${isDark ? 'text-white/40' : 'text-slate-400'}`}>z.B. Schmet|ter|ling</p>
+                  </div>
+                </label>
+              </div>
+            </div>
+
            <button
              onClick={generateWorksheet}
              disabled={(selectedFormat === 'standard' && selectedTypes.length === 0) || isGenerating}