diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py index 5f787d8..cbceaa5 100644 --- a/klausur-service/backend/vocab_worksheet_api.py +++ b/klausur-service/backend/vocab_worksheet_api.py @@ -1283,12 +1283,18 @@ async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Qu async def process_single_page( session_id: str, page_number: int, + ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"), + syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"), ): """ - Process a SINGLE page of an uploaded PDF using the OCR pipeline. + Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline. - Uses the multi-step CV pipeline (deskew → dewarp → columns → rows → words) - instead of LLM vision for much better extraction quality. + Uses the full Kombi pipeline (orientation → deskew → dewarp → crop → + dual-engine OCR → grid-build with autocorrect/merge) for best quality. + + Query params: + ipa_mode: "none" (default), "auto", "all", "en", "de" + syllable_mode: "none" (default), "auto", "all", "en", "de" The frontend should call this sequentially for each page. Returns the vocabulary for just this one page. @@ -1316,6 +1322,7 @@ async def process_single_page( img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0) page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page( img_bgr, page_number, session_id, + ipa_mode=ipa_mode, syllable_mode=syllable_mode, ) except Exception as e: logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True) @@ -1384,28 +1391,33 @@ async def _run_ocr_pipeline_for_page( img_bgr: np.ndarray, page_number: int, vocab_session_id: str, + *, + ipa_mode: str = "none", + syllable_mode: str = "none", ) -> tuple: - """Run the full OCR pipeline on a single page image and return vocab entries. + """Run the full Kombi OCR pipeline on a single page and return vocab entries. 
-    Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py).
+    Uses the same pipeline as the admin OCR Kombi pipeline:
+      orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
+      (with pipe-autocorrect, word-gap merge, dictionary detection, etc.)

     Args:
-        img_bgr: BGR numpy array (from render_pdf_high_res, same as admin pipeline).
+        img_bgr: BGR numpy array.
         page_number: 0-indexed page number.
         vocab_session_id: Vocab session ID for logging.
+        ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
+        syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".

-    Steps: deskew → dewarp → columns → rows → words → (LLM review)
     Returns (entries, rotation_deg) where entries is a list of dicts and
     rotation_deg is the orientation correction applied (0, 90, 180, 270).
     """
     import time as _time
     t_total = _time.time()

-    img_h, img_w = img_bgr.shape[:2]
-    logger.info(f"OCR Pipeline page {page_number + 1}: image {img_w}x{img_h}")
+    logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_bgr.shape[1]}x{img_bgr.shape[0]}")

-    # 1b. Orientation detection (fix upside-down scans)
+    # 1. Orientation detection (fix upside-down scans)
     t0 = _time.time()
     img_bgr, rotation = detect_and_fix_orientation(img_bgr)
     if rotation:
@@ -1414,7 +1426,7 @@ async def _run_ocr_pipeline_for_page(
     else:
         logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")

-    # 2. Create pipeline session in DB (for debugging in admin UI)
+    # 2. Create pipeline session in DB (visible in admin Kombi UI)
     pipeline_session_id = str(uuid.uuid4())
     try:
         _, png_buf = cv2.imencode(".png", img_bgr)
@@ -1428,155 +1440,216 @@ async def _run_ocr_pipeline_for_page(
     except Exception as e:
         logger.warning(f"Could not create pipeline session in DB: {e}")

-    # 3. Three-pass deskew: iterative + word-alignment + text-line regression
+    # 3. 
Three-pass deskew t0 = _time.time() deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy()) - angle_pass1 = deskew_debug.get("pass1_angle", 0.0) - angle_pass2 = deskew_debug.get("pass2_angle", 0.0) - angle_pass3 = deskew_debug.get("pass3_angle", 0.0) - - logger.info(f" deskew: p1={angle_pass1:.2f} p2={angle_pass2:.2f} " - f"p3={angle_pass3:.2f} total={angle_applied:.2f} " - f"({_time.time() - t0:.1f}s)") + logger.info(f" deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)") # 4. Dewarp t0 = _time.time() dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr) logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)") - # 5. Column detection + # 5. Content crop (removes scanner borders, gutter shadows) t0 = _time.time() - ocr_img = create_ocr_image(dewarped_bgr) - h, w = ocr_img.shape[:2] - - geo_result = detect_column_geometry(ocr_img, dewarped_bgr) - if geo_result is None: - layout_img = create_layout_image(dewarped_bgr) - regions = analyze_layout(layout_img, ocr_img) - word_dicts = None - inv = None - content_bounds = None - else: - geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result - content_w = right_x - left_x - header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None) - geometries = _detect_sub_columns(geometries, content_w, left_x=left_x, - top_y=top_y, header_y=header_y, footer_y=footer_y) - geometries = _split_broad_columns(geometries, content_w, left_x=left_x) - geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts) - content_h = bottom_y - top_y - regions = positional_column_regions(geometries, content_w, content_h, left_x) - content_bounds = (left_x, right_x, top_y, bottom_y) - - logger.info(f" columns: {len(regions)} detected ({_time.time() - t0:.1f}s)") - - # 6. 
Row detection - t0 = _time.time() - if word_dicts is None or inv is None or content_bounds is None: - # Re-run geometry detection to get intermediates - geo_result2 = detect_column_geometry(ocr_img, dewarped_bgr) - if geo_result2 is None: - raise ValueError("Column geometry detection failed — cannot detect rows") - _, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result2 - content_bounds = (left_x, right_x, top_y, bottom_y) - - left_x, right_x, top_y, bottom_y = content_bounds - rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y) - logger.info(f" rows: {len(rows)} detected ({_time.time() - t0:.1f}s)") - - # 7. Word recognition (cell-first OCR v2) - t0 = _time.time() - col_regions = regions # already PageRegion objects - - # Populate row.words for word_count filtering - for row in rows: - row_y_rel = row.y - top_y - row_bottom_rel = row_y_rel + row.height - row.words = [ - wd for wd in word_dicts - if row_y_rel <= wd['top'] + wd['height'] / 2 < row_bottom_rel - ] - row.word_count = len(row.words) - - cells, columns_meta = build_cell_grid_v2( - ocr_img, col_regions, rows, img_w, img_h, - ocr_engine="auto", img_bgr=dewarped_bgr, - ) - - col_types = {c['type'] for c in columns_meta} - is_vocab = bool(col_types & {'column_en', 'column_de'}) - logger.info(f" words: {len(cells)} cells, vocab={is_vocab} ({_time.time() - t0:.1f}s)") - - if not is_vocab: - logger.warning(f" Page {page_number + 1}: layout is not vocab table " - f"(types: {col_types}), returning empty") - return [], rotation - - # 8. Map cells → vocab entries - entries = _cells_to_vocab_entries(cells, columns_meta) - entries = _fix_phonetic_brackets(entries, pronunciation="british") - - # 9. 
Optional LLM review try: - review_result = await llm_review_entries(entries) - if review_result and review_result.get("changes"): - # Apply corrections - changes_map = {} - for ch in review_result["changes"]: - idx = ch.get("index") - if idx is not None: - changes_map[idx] = ch - for idx, ch in changes_map.items(): - if 0 <= idx < len(entries): - for field in ("english", "german", "example"): - if ch.get(field) and ch[field] != entries[idx].get(field): - entries[idx][field] = ch[field] - logger.info(f" llm review: {len(review_result['changes'])} corrections applied") + from page_crop import detect_and_crop_page + cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr) + if crop_result.get("crop_applied"): + dewarped_bgr = cropped_bgr + logger.info(f" crop: applied ({_time.time() - t0:.1f}s)") + else: + logger.info(f" crop: skipped ({_time.time() - t0:.1f}s)") except Exception as e: - logger.warning(f" llm review skipped: {e}") + logger.warning(f" crop: failed ({e}), continuing with uncropped image") - # 10. Map to frontend format - page_vocabulary = [] - for entry in entries: - if not entry.get("english") and not entry.get("german"): - continue # skip empty rows - page_vocabulary.append({ - "id": str(uuid.uuid4()), - "english": entry.get("english", ""), - "german": entry.get("german", ""), - "example_sentence": entry.get("example", ""), - "source_page": page_number + 1, + # 6. 
Dual-engine OCR (RapidOCR + Tesseract → merge) + t0 = _time.time() + img_h, img_w = dewarped_bgr.shape[:2] + + # RapidOCR (local ONNX) + try: + from cv_ocr_engines import ocr_region_rapid + from cv_vocab_types import PageRegion + full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h) + rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or [] + except Exception as e: + logger.warning(f" RapidOCR failed: {e}") + rapid_words = [] + + # Tesseract + from PIL import Image + import pytesseract + pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB)) + data = pytesseract.image_to_data( + pil_img, lang="eng+deu", config="--psm 6 --oem 3", + output_type=pytesseract.Output.DICT, + ) + tess_words = [] + for i in range(len(data["text"])): + text = str(data["text"][i]).strip() + conf_raw = str(data["conf"][i]) + conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1 + if not text or conf < 20: + continue + tess_words.append({ + "text": text, + "left": data["left"][i], "top": data["top"][i], + "width": data["width"][i], "height": data["height"][i], + "conf": conf, }) - # 11. 
Update pipeline session in DB (for admin debugging) - try: - success_dsk, dsk_buf = cv2.imencode(".png", deskewed_bgr) - deskewed_png = dsk_buf.tobytes() if success_dsk else None - success_dwp, dwp_buf = cv2.imencode(".png", dewarped_bgr) - dewarped_png = dwp_buf.tobytes() if success_dwp else None + # Merge dual-engine results + from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words + from cv_words_first import build_grid_from_words + rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else [] + if rapid_split or tess_words: + merged_words = _merge_paddle_tesseract(rapid_split, tess_words) + merged_words = _deduplicate_words(merged_words) + else: + merged_words = tess_words # fallback to Tesseract only + + # Build initial grid from merged words + cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h) + for cell in cells: + cell["ocr_engine"] = "rapid_kombi" + + n_rows = len(set(c["row_index"] for c in cells)) if cells else 0 + n_cols = len(columns_meta) + logger.info(f" ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, " + f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)") + + # 7. 
Save word_result to pipeline session (needed by _build_grid_core) + word_result = { + "cells": cells, + "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)}, + "columns_used": columns_meta, + "layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic", + "image_width": img_w, + "image_height": img_h, + "duration_seconds": 0, + "ocr_engine": "rapid_kombi", + "raw_tesseract_words": tess_words, + "summary": { + "total_cells": len(cells), + "non_empty_cells": sum(1 for c in cells if c.get("text")), + }, + } + + # Save images + word_result to pipeline session for admin visibility + try: + _, dsk_buf = cv2.imencode(".png", deskewed_bgr) + _, dwp_buf = cv2.imencode(".png", dewarped_bgr) await update_pipeline_session_db( pipeline_session_id, - deskewed_png=deskewed_png, - dewarped_png=dewarped_png, + deskewed_png=dsk_buf.tobytes(), + dewarped_png=dwp_buf.tobytes(), + cropped_png=cv2.imencode(".png", dewarped_bgr)[1].tobytes(), + word_result=word_result, deskew_result={"angle_applied": round(angle_applied, 3)}, dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)}, - column_result={"columns": [{"type": r.type, "x": r.x, "y": r.y, - "width": r.width, "height": r.height} - for r in col_regions]}, - row_result={"total_rows": len(rows)}, - word_result={ - "entry_count": len(page_vocabulary), - "layout": "vocab", - "vocab_entries": entries, - }, - current_step=6, + current_step=8, ) except Exception as e: logger.warning(f"Could not update pipeline session: {e}") + # 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.) 
+ t0 = _time.time() + try: + from grid_editor_api import _build_grid_core + session_data = { + "word_result": word_result, + } + grid_result = await _build_grid_core( + pipeline_session_id, session_data, + ipa_mode=ipa_mode, syllable_mode=syllable_mode, + ) + logger.info(f" grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells " + f"({_time.time() - t0:.1f}s)") + + # Save grid result to pipeline session + try: + await update_pipeline_session_db( + pipeline_session_id, + grid_editor_result=grid_result, + current_step=11, + ) + except Exception: + pass + + except Exception as e: + logger.warning(f" grid-build failed: {e}, falling back to basic grid") + grid_result = None + + # 9. Extract vocab entries from grid result (zones → cells → vocab) + page_vocabulary = [] + + if grid_result and grid_result.get("zones"): + # Extract from the improved zone-based grid + for zone in grid_result["zones"]: + zone_cols = zone.get("columns", []) + zone_cells = zone.get("cells", []) + if not zone_cols or not zone_cells: + continue + + # Build col_index → col_type map + col_type_map = {} + for col in zone_cols: + ci = col.get("col_index", col.get("index", -1)) + col_type_map[ci] = col.get("type", col.get("col_type", "")) + + # Group cells by row + rows_map = {} + for cell in zone_cells: + ri = cell.get("row_index", 0) + if ri not in rows_map: + rows_map[ri] = {} + ci = cell.get("col_index", 0) + rows_map[ri][ci] = cell + + for ri in sorted(rows_map.keys()): + row_cells = rows_map[ri] + en = "" + de = "" + ex = "" + for ci, cell in row_cells.items(): + ct = col_type_map.get(ci, "") + text = (cell.get("text") or "").strip() + if not text: + continue + if "en" in ct: + en = text + elif "de" in ct: + de = text + elif "example" in ct or "text" in ct: + ex = text if not ex else ex + " " + text + + if en or de: + page_vocabulary.append({ + "id": str(uuid.uuid4()), + "english": en, + "german": de, + "example_sentence": ex, + "source_page": page_number + 1, + }) + else: + # 
Fallback: use basic cells → vocab entries + entries = _cells_to_vocab_entries(cells, columns_meta) + entries = _fix_phonetic_brackets(entries, pronunciation="british") + for entry in entries: + if not entry.get("english") and not entry.get("german"): + continue + page_vocabulary.append({ + "id": str(uuid.uuid4()), + "english": entry.get("english", ""), + "german": entry.get("german", ""), + "example_sentence": entry.get("example", ""), + "source_page": page_number + 1, + }) + total_duration = _time.time() - t_total - logger.info(f"OCR Pipeline page {page_number + 1}: " + logger.info(f"Kombi Pipeline page {page_number + 1}: " f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s") return page_vocabulary, rotation diff --git a/studio-v2/app/vocab-worksheet/page.tsx b/studio-v2/app/vocab-worksheet/page.tsx index 77ef950..2516b7e 100644 --- a/studio-v2/app/vocab-worksheet/page.tsx +++ b/studio-v2/app/vocab-worksheet/page.tsx @@ -156,6 +156,8 @@ export default function VocabWorksheetPage() { const [includeSolutions, setIncludeSolutions] = useState(true) const [lineHeight, setLineHeight] = useState('normal') const [selectedFormat, setSelectedFormat] = useState('standard') + const [showIpa, setShowIpa] = useState(false) + const [showSyllables, setShowSyllables] = useState(false) // Export state const [worksheetId, setWorksheetId] = useState(null) @@ -431,7 +433,9 @@ export default function VocabWorksheetPage() { const API_BASE = getApiBase() try { - const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}`, { + const ipaParam = showIpa ? 'auto' : 'none' + const syllableParam = showSyllables ? 
'auto' : 'none' + const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}?ipa_mode=${ipaParam}&syllable_mode=${syllableParam}`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ ocr_prompts: ocrPrompts }), @@ -1907,6 +1911,27 @@ export default function VocabWorksheetPage() { )} + {/* OCR display options */} +
+

Anzeigeoptionen

+
+ + +
+
+