Replace old OCR pipeline with Kombi pipeline + add IPA/syllable toggles

Backend:
- _run_ocr_pipeline_for_page() now runs the full Kombi pipeline: orientation → deskew → dewarp → content crop → dual-engine OCR (RapidOCR + Tesseract merge) → _build_grid_core() with pipe-autocorrect, word-gap merge, dictionary detection
- Accepts ipa_mode and syllable_mode query params on process-single-page
- Pipeline sessions are visible in admin OCR Kombi UI for debugging

Frontend (vocab-worksheet):
- New "Anzeigeoptionen" section with IPA and syllable toggles
- Settings are passed to process-single-page as query parameters

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 00:43:42 +02:00
parent 2828871e42
commit 3b78baf37f
2 changed files with 235 additions and 137 deletions
@@ -1283,12 +1283,18 @@ async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Qu
 async def process_single_page(
    session_id: str,
    page_number: int,
    ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
 ):
    """
-    Process a SINGLE page of an uploaded PDF using the OCR pipeline.
+    Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.
-    Uses the multi-step CV pipeline (deskew → dewarp → columns → rows → words)
+    Uses the full Kombi pipeline (orientation → deskew → dewarp → crop →
-    instead of LLM vision for much better extraction quality.
+    dual-engine OCR → grid-build with autocorrect/merge) for best quality.
    Query params:
        ipa_mode: "none" (default), "auto", "all", "en", "de"
        syllable_mode: "none" (default), "auto", "all", "en", "de"
    The frontend should call this sequentially for each page.
    Returns the vocabulary for just this one page.
@@ -1316,6 +1322,7 @@ async def process_single_page(
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
                img_bgr, page_number, session_id,
                ipa_mode=ipa_mode, syllable_mode=syllable_mode,
            )
        except Exception as e:
            logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
@@ -1384,28 +1391,33 @@ async def _run_ocr_pipeline_for_page(
    img_bgr: np.ndarray,
    page_number: int,
    vocab_session_id: str,
    *,
    ipa_mode: str = "none",
    syllable_mode: str = "none",
 ) -> tuple:
-    """Run the full OCR pipeline on a single page image and return vocab entries.
+    """Run the full Kombi OCR pipeline on a single page and return vocab entries.
-    Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py).
+    Uses the same pipeline as the admin OCR Kombi pipeline:
    orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
    (with pipe-autocorrect, word-gap merge, dictionary detection, etc.)
    Args:
-        img_bgr: BGR numpy array (from render_pdf_high_res, same as admin pipeline).
+        img_bgr: BGR numpy array.
        page_number: 0-indexed page number.
        vocab_session_id: Vocab session ID for logging.
        ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
        syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".
    Steps: deskew → dewarp → columns → rows → words → (LLM review)
    Returns (entries, rotation_deg) where entries is a list of dicts and
    rotation_deg is the orientation correction applied (0, 90, 180, 270).
    """
    import time as _time
    t_total = _time.time()
    img_h, img_w = img_bgr.shape[:2]
-    logger.info(f"OCR Pipeline page {page_number + 1}: image {img_w}x{img_h}")
+    logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")
-    # 1b. Orientation detection (fix upside-down scans)
+    # 1. Orientation detection (fix upside-down scans)
    t0 = _time.time()
    img_bgr, rotation = detect_and_fix_orientation(img_bgr)
    if rotation:
@@ -1414,7 +1426,7 @@ async def _run_ocr_pipeline_for_page(
    else:
        logger.info(f"  orientation: OK ({_time.time() - t0:.1f}s)")
-    # 2. Create pipeline session in DB (for debugging in admin UI)
+    # 2. Create pipeline session in DB (visible in admin Kombi UI)
    pipeline_session_id = str(uuid.uuid4())
    try:
        _, png_buf = cv2.imencode(".png", img_bgr)
@@ -1428,155 +1440,216 @@ async def _run_ocr_pipeline_for_page(
    except Exception as e:
        logger.warning(f"Could not create pipeline session in DB: {e}")
-    # 3. Three-pass deskew: iterative + word-alignment + text-line regression
+    # 3. Three-pass deskew
    t0 = _time.time()
    deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
-    angle_pass1 = deskew_debug.get("pass1_angle", 0.0)
+    logger.info(f"  deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")
    angle_pass2 = deskew_debug.get("pass2_angle", 0.0)
    angle_pass3 = deskew_debug.get("pass3_angle", 0.0)
    logger.info(f"  deskew: p1={angle_pass1:.2f} p2={angle_pass2:.2f} "
                f"p3={angle_pass3:.2f} total={angle_applied:.2f} "
                f"({_time.time() - t0:.1f}s)")
    # 4. Dewarp
    t0 = _time.time()
    dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
    logger.info(f"  dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")
-    # 5. Column detection
+    # 5. Content crop (removes scanner borders, gutter shadows)
    t0 = _time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    h, w = ocr_img.shape[:2]
    geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
    if geo_result is None:
        layout_img = create_layout_image(dewarped_bgr)
        regions = analyze_layout(layout_img, ocr_img)
        word_dicts = None
        inv = None
        content_bounds = None
    else:
        geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
        content_w = right_x - left_x
        header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
        geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                          top_y=top_y, header_y=header_y, footer_y=footer_y)
        geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
        geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
        content_h = bottom_y - top_y
        regions = positional_column_regions(geometries, content_w, content_h, left_x)
        content_bounds = (left_x, right_x, top_y, bottom_y)
    logger.info(f"  columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")
    # 6. Row detection
    t0 = _time.time()
    if word_dicts is None or inv is None or content_bounds is None:
        # Re-run geometry detection to get intermediates
        geo_result2 = detect_column_geometry(ocr_img, dewarped_bgr)
        if geo_result2 is None:
            raise ValueError("Column geometry detection failed — cannot detect rows")
        _, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result2
        content_bounds = (left_x, right_x, top_y, bottom_y)
    left_x, right_x, top_y, bottom_y = content_bounds
    rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
    logger.info(f"  rows: {len(rows)} detected ({_time.time() - t0:.1f}s)")
    # 7. Word recognition (cell-first OCR v2)
    t0 = _time.time()
    col_regions = regions  # already PageRegion objects
    # Populate row.words for word_count filtering
    for row in rows:
        row_y_rel = row.y - top_y
        row_bottom_rel = row_y_rel + row.height
        row.words = [
            wd for wd in word_dicts
            if row_y_rel <= wd['top'] + wd['height'] / 2 < row_bottom_rel
        ]
        row.word_count = len(row.words)
    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, rows, img_w, img_h,
        ocr_engine="auto", img_bgr=dewarped_bgr,
    )
    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    logger.info(f"  words: {len(cells)} cells, vocab={is_vocab} ({_time.time() - t0:.1f}s)")
    if not is_vocab:
        logger.warning(f"  Page {page_number + 1}: layout is not vocab table "
                       f"(types: {col_types}), returning empty")
        return [], rotation
    # 8. Map cells → vocab entries
    entries = _cells_to_vocab_entries(cells, columns_meta)
    entries = _fix_phonetic_brackets(entries, pronunciation="british")
    # 9. Optional LLM review
    try:
-        review_result = await llm_review_entries(entries)
+        from page_crop import detect_and_crop_page
-        if review_result and review_result.get("changes"):
+        cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
-            # Apply corrections
+        if crop_result.get("crop_applied"):
-            changes_map = {}
+            dewarped_bgr = cropped_bgr
-            for ch in review_result["changes"]:
+            logger.info(f"  crop: applied ({_time.time() - t0:.1f}s)")
-                idx = ch.get("index")
+        else:
-                if idx is not None:
+            logger.info(f"  crop: skipped ({_time.time() - t0:.1f}s)")
                    changes_map[idx] = ch
            for idx, ch in changes_map.items():
                if 0 <= idx < len(entries):
                    for field in ("english", "german", "example"):
                        if ch.get(field) and ch[field] != entries[idx].get(field):
                            entries[idx][field] = ch[field]
            logger.info(f"  llm review: {len(review_result['changes'])} corrections applied")
    except Exception as e:
-        logger.warning(f"  llm review skipped: {e}")
+        logger.warning(f"  crop: failed ({e}), continuing with uncropped image")
-    # 10. Map to frontend format
+    # 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
-    page_vocabulary = []
+    t0 = _time.time()
-    for entry in entries:
+    img_h, img_w = dewarped_bgr.shape[:2]
-        if not entry.get("english") and not entry.get("german"):
+
-            continue  # skip empty rows
+    # RapidOCR (local ONNX)
-        page_vocabulary.append({
+    try:
-            "id": str(uuid.uuid4()),
+        from cv_ocr_engines import ocr_region_rapid
-            "english": entry.get("english", ""),
+        from cv_vocab_types import PageRegion
-            "german": entry.get("german", ""),
+        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
-            "example_sentence": entry.get("example", ""),
+        rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
-            "source_page": page_number + 1,
+    except Exception as e:
        logger.warning(f"  RapidOCR failed: {e}")
        rapid_words = []
    # Tesseract
    from PIL import Image
    import pytesseract
    pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_img, lang="eng+deu", config="--psm 6 --oem 3",
        output_type=pytesseract.Output.DICT,
    )
    tess_words = []
    for i in range(len(data["text"])):
        text = str(data["text"][i]).strip()
        conf_raw = str(data["conf"][i])
        conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
        if not text or conf < 20:
            continue
        tess_words.append({
            "text": text,
            "left": data["left"][i], "top": data["top"][i],
            "width": data["width"][i], "height": data["height"][i],
            "conf": conf,
        })
-    # 11. Update pipeline session in DB (for admin debugging)
+    # Merge dual-engine results
-    try:
+    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
-        success_dsk, dsk_buf = cv2.imencode(".png", deskewed_bgr)
+    from cv_words_first import build_grid_from_words
        deskewed_png = dsk_buf.tobytes() if success_dsk else None
        success_dwp, dwp_buf = cv2.imencode(".png", dewarped_bgr)
        dewarped_png = dwp_buf.tobytes() if success_dwp else None
    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
    if rapid_split or tess_words:
        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
        merged_words = _deduplicate_words(merged_words)
    else:
        merged_words = tess_words  # fallback to Tesseract only
    # Build initial grid from merged words
    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
    for cell in cells:
        cell["ocr_engine"] = "rapid_kombi"
    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    logger.info(f"  ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
                f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")
    # 7. Save word_result to pipeline session (needed by _build_grid_core)
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": 0,
        "ocr_engine": "rapid_kombi",
        "raw_tesseract_words": tess_words,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
        },
    }
    # Save images + word_result to pipeline session for admin visibility
    try:
        _, dsk_buf = cv2.imencode(".png", deskewed_bgr)
        _, dwp_buf = cv2.imencode(".png", dewarped_bgr)
        await update_pipeline_session_db(
            pipeline_session_id,
-            deskewed_png=deskewed_png,
+            deskewed_png=dsk_buf.tobytes(),
-            dewarped_png=dewarped_png,
+            dewarped_png=dwp_buf.tobytes(),
            cropped_png=cv2.imencode(".png", dewarped_bgr)[1].tobytes(),
            word_result=word_result,
            deskew_result={"angle_applied": round(angle_applied, 3)},
            dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
-            column_result={"columns": [{"type": r.type, "x": r.x, "y": r.y,
+            current_step=8,
                                         "width": r.width, "height": r.height}
                                        for r in col_regions]},
            row_result={"total_rows": len(rows)},
            word_result={
                "entry_count": len(page_vocabulary),
                "layout": "vocab",
                "vocab_entries": entries,
            },
            current_step=6,
        )
    except Exception as e:
        logger.warning(f"Could not update pipeline session: {e}")
    # 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
    t0 = _time.time()
    try:
        from grid_editor_api import _build_grid_core
        session_data = {
            "word_result": word_result,
        }
        grid_result = await _build_grid_core(
            pipeline_session_id, session_data,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
        )
        logger.info(f"  grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
                    f"({_time.time() - t0:.1f}s)")
        # Save grid result to pipeline session
        try:
            await update_pipeline_session_db(
                pipeline_session_id,
                grid_editor_result=grid_result,
                current_step=11,
            )
        except Exception:
            pass
    except Exception as e:
        logger.warning(f"  grid-build failed: {e}, falling back to basic grid")
        grid_result = None
    # 9. Extract vocab entries from grid result (zones → cells → vocab)
    page_vocabulary = []
    if grid_result and grid_result.get("zones"):
        # Extract from the improved zone-based grid
        for zone in grid_result["zones"]:
            zone_cols = zone.get("columns", [])
            zone_cells = zone.get("cells", [])
            if not zone_cols or not zone_cells:
                continue
            # Build col_index → col_type map
            col_type_map = {}
            for col in zone_cols:
                ci = col.get("col_index", col.get("index", -1))
                col_type_map[ci] = col.get("type", col.get("col_type", ""))
            # Group cells by row
            rows_map = {}
            for cell in zone_cells:
                ri = cell.get("row_index", 0)
                if ri not in rows_map:
                    rows_map[ri] = {}
                ci = cell.get("col_index", 0)
                rows_map[ri][ci] = cell
            for ri in sorted(rows_map.keys()):
                row_cells = rows_map[ri]
                en = ""
                de = ""
                ex = ""
                for ci, cell in row_cells.items():
                    ct = col_type_map.get(ci, "")
                    text = (cell.get("text") or "").strip()
                    if not text:
                        continue
                    if "en" in ct:
                        en = text
                    elif "de" in ct:
                        de = text
                    elif "example" in ct or "text" in ct:
                        ex = text if not ex else ex + " " + text
                if en or de:
                    page_vocabulary.append({
                        "id": str(uuid.uuid4()),
                        "english": en,
                        "german": de,
                        "example_sentence": ex,
                        "source_page": page_number + 1,
                    })
    else:
        # Fallback: use basic cells → vocab entries
        entries = _cells_to_vocab_entries(cells, columns_meta)
        entries = _fix_phonetic_brackets(entries, pronunciation="british")
        for entry in entries:
            if not entry.get("english") and not entry.get("german"):
                continue
            page_vocabulary.append({
                "id": str(uuid.uuid4()),
                "english": entry.get("english", ""),
                "german": entry.get("german", ""),
                "example_sentence": entry.get("example", ""),
                "source_page": page_number + 1,
            })
    total_duration = _time.time() - t_total
-    logger.info(f"OCR Pipeline page {page_number + 1}: "
+    logger.info(f"Kombi Pipeline page {page_number + 1}: "
                f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
    return page_vocabulary, rotation
@@ -156,6 +156,8 @@ export default function VocabWorksheetPage() {
  const [includeSolutions, setIncludeSolutions] = useState(true)
  const [lineHeight, setLineHeight] = useState('normal')
  const [selectedFormat, setSelectedFormat] = useState<WorksheetFormat>('standard')
  const [showIpa, setShowIpa] = useState(false)
  const [showSyllables, setShowSyllables] = useState(false)
  // Export state
  const [worksheetId, setWorksheetId] = useState<string | null>(null)
@@ -431,7 +433,9 @@ export default function VocabWorksheetPage() {
    const API_BASE = getApiBase()
    try {
-      const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}`, {
+      const ipaParam = showIpa ? 'auto' : 'none'
      const syllableParam = showSyllables ? 'auto' : 'none'
      const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}?ipa_mode=${ipaParam}&syllable_mode=${syllableParam}`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ ocr_prompts: ocrPrompts }),
@@ -1907,6 +1911,27 @@ export default function VocabWorksheetPage() {
              )}
            </div>
            {/* OCR display options */}
            <div className={`p-4 rounded-xl border ${isDark ? 'bg-white/5 border-white/10' : 'bg-gray-50 border-gray-200'} space-y-3`}>
              <h4 className={`text-sm font-medium ${isDark ? 'text-white/70' : 'text-slate-600'}`}>Anzeigeoptionen</h4>
              <div className="flex flex-col gap-2">
                <label className={`flex items-center gap-3 cursor-pointer ${isDark ? 'text-white' : 'text-slate-900'}`}>
                  <input type="checkbox" checked={showIpa} onChange={(e) => setShowIpa(e.target.checked)} className="w-5 h-5 rounded border-2 border-purple-500 text-purple-500 focus:ring-purple-500" />
                  <div>
                    <span>Lautschrift (IPA) anzeigen</span>
                    <p className={`text-xs ${isDark ? 'text-white/40' : 'text-slate-400'}`}>z.B. achieve [əˈtʃiːv]</p>
                  </div>
                </label>
                <label className={`flex items-center gap-3 cursor-pointer ${isDark ? 'text-white' : 'text-slate-900'}`}>
                  <input type="checkbox" checked={showSyllables} onChange={(e) => setShowSyllables(e.target.checked)} className="w-5 h-5 rounded border-2 border-purple-500 text-purple-500 focus:ring-purple-500" />
                  <div>
                    <span>Silbentrennung anzeigen</span>
                    <p className={`text-xs ${isDark ? 'text-white/40' : 'text-slate-400'}`}>z.B. Schmet|ter|ling</p>
                  </div>
                </label>
              </div>
            </div>
            <button
              onClick={generateWorksheet}
              disabled={(selectedFormat === 'standard' && selectedTypes.length === 0) || isGenerating}