Replace old OCR pipeline with Kombi pipeline + add IPA/syllable toggles
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 41s
CI / test-go-edu-search (push) Successful in 37s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 32s
CI / test-nodejs-website (push) Successful in 33s

Backend:
- _run_ocr_pipeline_for_page() now runs the full Kombi pipeline:
  orientation → deskew → dewarp → content crop → dual-engine OCR
  (RapidOCR + Tesseract merge) → _build_grid_core() with pipe-autocorrect,
  word-gap merge, dictionary detection
- process-single-page accepts ipa_mode and syllable_mode query params (auto|all|de|en|none, default none) and forwards them to the pipeline
- Pipeline sessions are visible in admin OCR Kombi UI for debugging

Frontend (vocab-worksheet):
- New "Anzeigeoptionen" section with IPA and syllable toggles
- Toggle states are passed to process-single-page as ipa_mode/syllable_mode query parameters ('auto' when enabled, 'none' otherwise)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-11 00:43:42 +02:00
parent 2828871e42
commit 3b78baf37f
2 changed files with 235 additions and 137 deletions

View File

@@ -1283,12 +1283,18 @@ async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Qu
async def process_single_page( async def process_single_page(
session_id: str, session_id: str,
page_number: int, page_number: int,
ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
): ):
""" """
Process a SINGLE page of an uploaded PDF using the OCR pipeline. Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.
Uses the multi-step CV pipeline (deskew → dewarp → columns → rows → words) Uses the full Kombi pipeline (orientation → deskew → dewarp → crop →
instead of LLM vision for much better extraction quality. dual-engine OCR → grid-build with autocorrect/merge) for best quality.
Query params:
ipa_mode: "none" (default), "auto", "all", "en", "de"
syllable_mode: "none" (default), "auto", "all", "en", "de"
The frontend should call this sequentially for each page. The frontend should call this sequentially for each page.
Returns the vocabulary for just this one page. Returns the vocabulary for just this one page.
@@ -1316,6 +1322,7 @@ async def process_single_page(
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0) img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page( page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
img_bgr, page_number, session_id, img_bgr, page_number, session_id,
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
) )
except Exception as e: except Exception as e:
logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True) logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
@@ -1384,28 +1391,33 @@ async def _run_ocr_pipeline_for_page(
img_bgr: np.ndarray, img_bgr: np.ndarray,
page_number: int, page_number: int,
vocab_session_id: str, vocab_session_id: str,
*,
ipa_mode: str = "none",
syllable_mode: str = "none",
) -> tuple: ) -> tuple:
"""Run the full OCR pipeline on a single page image and return vocab entries. """Run the full Kombi OCR pipeline on a single page and return vocab entries.
Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py). Uses the same pipeline as the admin OCR Kombi pipeline:
orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
(with pipe-autocorrect, word-gap merge, dictionary detection, etc.)
Args: Args:
img_bgr: BGR numpy array (from render_pdf_high_res, same as admin pipeline). img_bgr: BGR numpy array.
page_number: 0-indexed page number. page_number: 0-indexed page number.
vocab_session_id: Vocab session ID for logging. vocab_session_id: Vocab session ID for logging.
ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".
Steps: deskew → dewarp → columns → rows → words → (LLM review)
Returns (entries, rotation_deg) where entries is a list of dicts and Returns (entries, rotation_deg) where entries is a list of dicts and
rotation_deg is the orientation correction applied (0, 90, 180, 270). rotation_deg is the orientation correction applied (0, 90, 180, 270).
""" """
import time as _time import time as _time
t_total = _time.time() t_total = _time.time()
img_h, img_w = img_bgr.shape[:2] img_h, img_w = img_bgr.shape[:2]
logger.info(f"OCR Pipeline page {page_number + 1}: image {img_w}x{img_h}") logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")
# 1b. Orientation detection (fix upside-down scans) # 1. Orientation detection (fix upside-down scans)
t0 = _time.time() t0 = _time.time()
img_bgr, rotation = detect_and_fix_orientation(img_bgr) img_bgr, rotation = detect_and_fix_orientation(img_bgr)
if rotation: if rotation:
@@ -1414,7 +1426,7 @@ async def _run_ocr_pipeline_for_page(
else: else:
logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)") logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")
# 2. Create pipeline session in DB (for debugging in admin UI) # 2. Create pipeline session in DB (visible in admin Kombi UI)
pipeline_session_id = str(uuid.uuid4()) pipeline_session_id = str(uuid.uuid4())
try: try:
_, png_buf = cv2.imencode(".png", img_bgr) _, png_buf = cv2.imencode(".png", img_bgr)
@@ -1428,118 +1440,206 @@ async def _run_ocr_pipeline_for_page(
except Exception as e: except Exception as e:
logger.warning(f"Could not create pipeline session in DB: {e}") logger.warning(f"Could not create pipeline session in DB: {e}")
# 3. Three-pass deskew: iterative + word-alignment + text-line regression # 3. Three-pass deskew
t0 = _time.time() t0 = _time.time()
deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy()) deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
angle_pass1 = deskew_debug.get("pass1_angle", 0.0) logger.info(f" deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")
angle_pass2 = deskew_debug.get("pass2_angle", 0.0)
angle_pass3 = deskew_debug.get("pass3_angle", 0.0)
logger.info(f" deskew: p1={angle_pass1:.2f} p2={angle_pass2:.2f} "
f"p3={angle_pass3:.2f} total={angle_applied:.2f} "
f"({_time.time() - t0:.1f}s)")
# 4. Dewarp # 4. Dewarp
t0 = _time.time() t0 = _time.time()
dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr) dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)") logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")
# 5. Column detection # 5. Content crop (removes scanner borders, gutter shadows)
t0 = _time.time() t0 = _time.time()
ocr_img = create_ocr_image(dewarped_bgr) try:
h, w = ocr_img.shape[:2] from page_crop import detect_and_crop_page
cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
geo_result = detect_column_geometry(ocr_img, dewarped_bgr) if crop_result.get("crop_applied"):
if geo_result is None: dewarped_bgr = cropped_bgr
layout_img = create_layout_image(dewarped_bgr) logger.info(f" crop: applied ({_time.time() - t0:.1f}s)")
regions = analyze_layout(layout_img, ocr_img)
word_dicts = None
inv = None
content_bounds = None
else: else:
geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result logger.info(f" crop: skipped ({_time.time() - t0:.1f}s)")
content_w = right_x - left_x except Exception as e:
header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None) logger.warning(f" crop: failed ({e}), continuing with uncropped image")
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
top_y=top_y, header_y=header_y, footer_y=footer_y)
geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
content_h = bottom_y - top_y
regions = positional_column_regions(geometries, content_w, content_h, left_x)
content_bounds = (left_x, right_x, top_y, bottom_y)
logger.info(f" columns: {len(regions)} detected ({_time.time() - t0:.1f}s)") # 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
# 6. Row detection
t0 = _time.time() t0 = _time.time()
if word_dicts is None or inv is None or content_bounds is None: img_h, img_w = dewarped_bgr.shape[:2]
# Re-run geometry detection to get intermediates
geo_result2 = detect_column_geometry(ocr_img, dewarped_bgr)
if geo_result2 is None:
raise ValueError("Column geometry detection failed — cannot detect rows")
_, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result2
content_bounds = (left_x, right_x, top_y, bottom_y)
left_x, right_x, top_y, bottom_y = content_bounds # RapidOCR (local ONNX)
rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y) try:
logger.info(f" rows: {len(rows)} detected ({_time.time() - t0:.1f}s)") from cv_ocr_engines import ocr_region_rapid
from cv_vocab_types import PageRegion
full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
except Exception as e:
logger.warning(f" RapidOCR failed: {e}")
rapid_words = []
# 7. Word recognition (cell-first OCR v2) # Tesseract
t0 = _time.time() from PIL import Image
col_regions = regions # already PageRegion objects import pytesseract
pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
# Populate row.words for word_count filtering data = pytesseract.image_to_data(
for row in rows: pil_img, lang="eng+deu", config="--psm 6 --oem 3",
row_y_rel = row.y - top_y output_type=pytesseract.Output.DICT,
row_bottom_rel = row_y_rel + row.height
row.words = [
wd for wd in word_dicts
if row_y_rel <= wd['top'] + wd['height'] / 2 < row_bottom_rel
]
row.word_count = len(row.words)
cells, columns_meta = build_cell_grid_v2(
ocr_img, col_regions, rows, img_w, img_h,
ocr_engine="auto", img_bgr=dewarped_bgr,
) )
tess_words = []
for i in range(len(data["text"])):
text = str(data["text"][i]).strip()
conf_raw = str(data["conf"][i])
conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
if not text or conf < 20:
continue
tess_words.append({
"text": text,
"left": data["left"][i], "top": data["top"][i],
"width": data["width"][i], "height": data["height"][i],
"conf": conf,
})
col_types = {c['type'] for c in columns_meta} # Merge dual-engine results
is_vocab = bool(col_types & {'column_en', 'column_de'}) from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
logger.info(f" words: {len(cells)} cells, vocab={is_vocab} ({_time.time() - t0:.1f}s)") from cv_words_first import build_grid_from_words
if not is_vocab: rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
logger.warning(f" Page {page_number + 1}: layout is not vocab table " if rapid_split or tess_words:
f"(types: {col_types}), returning empty") merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
return [], rotation merged_words = _deduplicate_words(merged_words)
else:
merged_words = tess_words # fallback to Tesseract only
# 8. Map cells → vocab entries # Build initial grid from merged words
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
for cell in cells:
cell["ocr_engine"] = "rapid_kombi"
n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
n_cols = len(columns_meta)
logger.info(f" ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")
# 7. Save word_result to pipeline session (needed by _build_grid_core)
word_result = {
"cells": cells,
"grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
"columns_used": columns_meta,
"layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
"image_width": img_w,
"image_height": img_h,
"duration_seconds": 0,
"ocr_engine": "rapid_kombi",
"raw_tesseract_words": tess_words,
"summary": {
"total_cells": len(cells),
"non_empty_cells": sum(1 for c in cells if c.get("text")),
},
}
# Save images + word_result to pipeline session for admin visibility
try:
_, dsk_buf = cv2.imencode(".png", deskewed_bgr)
_, dwp_buf = cv2.imencode(".png", dewarped_bgr)
await update_pipeline_session_db(
pipeline_session_id,
deskewed_png=dsk_buf.tobytes(),
dewarped_png=dwp_buf.tobytes(),
cropped_png=cv2.imencode(".png", dewarped_bgr)[1].tobytes(),
word_result=word_result,
deskew_result={"angle_applied": round(angle_applied, 3)},
dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
current_step=8,
)
except Exception as e:
logger.warning(f"Could not update pipeline session: {e}")
# 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
t0 = _time.time()
try:
from grid_editor_api import _build_grid_core
session_data = {
"word_result": word_result,
}
grid_result = await _build_grid_core(
pipeline_session_id, session_data,
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
)
logger.info(f" grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
f"({_time.time() - t0:.1f}s)")
# Save grid result to pipeline session
try:
await update_pipeline_session_db(
pipeline_session_id,
grid_editor_result=grid_result,
current_step=11,
)
except Exception:
pass
except Exception as e:
logger.warning(f" grid-build failed: {e}, falling back to basic grid")
grid_result = None
# 9. Extract vocab entries from grid result (zones → cells → vocab)
page_vocabulary = []
if grid_result and grid_result.get("zones"):
# Extract from the improved zone-based grid
for zone in grid_result["zones"]:
zone_cols = zone.get("columns", [])
zone_cells = zone.get("cells", [])
if not zone_cols or not zone_cells:
continue
# Build col_index → col_type map
col_type_map = {}
for col in zone_cols:
ci = col.get("col_index", col.get("index", -1))
col_type_map[ci] = col.get("type", col.get("col_type", ""))
# Group cells by row
rows_map = {}
for cell in zone_cells:
ri = cell.get("row_index", 0)
if ri not in rows_map:
rows_map[ri] = {}
ci = cell.get("col_index", 0)
rows_map[ri][ci] = cell
for ri in sorted(rows_map.keys()):
row_cells = rows_map[ri]
en = ""
de = ""
ex = ""
for ci, cell in row_cells.items():
ct = col_type_map.get(ci, "")
text = (cell.get("text") or "").strip()
if not text:
continue
if "en" in ct:
en = text
elif "de" in ct:
de = text
elif "example" in ct or "text" in ct:
ex = text if not ex else ex + " " + text
if en or de:
page_vocabulary.append({
"id": str(uuid.uuid4()),
"english": en,
"german": de,
"example_sentence": ex,
"source_page": page_number + 1,
})
else:
# Fallback: use basic cells → vocab entries
entries = _cells_to_vocab_entries(cells, columns_meta) entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_phonetic_brackets(entries, pronunciation="british") entries = _fix_phonetic_brackets(entries, pronunciation="british")
# 9. Optional LLM review
try:
review_result = await llm_review_entries(entries)
if review_result and review_result.get("changes"):
# Apply corrections
changes_map = {}
for ch in review_result["changes"]:
idx = ch.get("index")
if idx is not None:
changes_map[idx] = ch
for idx, ch in changes_map.items():
if 0 <= idx < len(entries):
for field in ("english", "german", "example"):
if ch.get(field) and ch[field] != entries[idx].get(field):
entries[idx][field] = ch[field]
logger.info(f" llm review: {len(review_result['changes'])} corrections applied")
except Exception as e:
logger.warning(f" llm review skipped: {e}")
# 10. Map to frontend format
page_vocabulary = []
for entry in entries: for entry in entries:
if not entry.get("english") and not entry.get("german"): if not entry.get("english") and not entry.get("german"):
continue # skip empty rows continue
page_vocabulary.append({ page_vocabulary.append({
"id": str(uuid.uuid4()), "id": str(uuid.uuid4()),
"english": entry.get("english", ""), "english": entry.get("english", ""),
@@ -1548,35 +1648,8 @@ async def _run_ocr_pipeline_for_page(
"source_page": page_number + 1, "source_page": page_number + 1,
}) })
# 11. Update pipeline session in DB (for admin debugging)
try:
success_dsk, dsk_buf = cv2.imencode(".png", deskewed_bgr)
deskewed_png = dsk_buf.tobytes() if success_dsk else None
success_dwp, dwp_buf = cv2.imencode(".png", dewarped_bgr)
dewarped_png = dwp_buf.tobytes() if success_dwp else None
await update_pipeline_session_db(
pipeline_session_id,
deskewed_png=deskewed_png,
dewarped_png=dewarped_png,
deskew_result={"angle_applied": round(angle_applied, 3)},
dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
column_result={"columns": [{"type": r.type, "x": r.x, "y": r.y,
"width": r.width, "height": r.height}
for r in col_regions]},
row_result={"total_rows": len(rows)},
word_result={
"entry_count": len(page_vocabulary),
"layout": "vocab",
"vocab_entries": entries,
},
current_step=6,
)
except Exception as e:
logger.warning(f"Could not update pipeline session: {e}")
total_duration = _time.time() - t_total total_duration = _time.time() - t_total
logger.info(f"OCR Pipeline page {page_number + 1}: " logger.info(f"Kombi Pipeline page {page_number + 1}: "
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s") f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
return page_vocabulary, rotation return page_vocabulary, rotation

View File

@@ -156,6 +156,8 @@ export default function VocabWorksheetPage() {
const [includeSolutions, setIncludeSolutions] = useState(true) const [includeSolutions, setIncludeSolutions] = useState(true)
const [lineHeight, setLineHeight] = useState('normal') const [lineHeight, setLineHeight] = useState('normal')
const [selectedFormat, setSelectedFormat] = useState<WorksheetFormat>('standard') const [selectedFormat, setSelectedFormat] = useState<WorksheetFormat>('standard')
const [showIpa, setShowIpa] = useState(false)
const [showSyllables, setShowSyllables] = useState(false)
// Export state // Export state
const [worksheetId, setWorksheetId] = useState<string | null>(null) const [worksheetId, setWorksheetId] = useState<string | null>(null)
@@ -431,7 +433,9 @@ export default function VocabWorksheetPage() {
const API_BASE = getApiBase() const API_BASE = getApiBase()
try { try {
const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}`, { const ipaParam = showIpa ? 'auto' : 'none'
const syllableParam = showSyllables ? 'auto' : 'none'
const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}?ipa_mode=${ipaParam}&syllable_mode=${syllableParam}`, {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ ocr_prompts: ocrPrompts }), body: JSON.stringify({ ocr_prompts: ocrPrompts }),
@@ -1907,6 +1911,27 @@ export default function VocabWorksheetPage() {
)} )}
</div> </div>
{/* OCR display options */}
<div className={`p-4 rounded-xl border ${isDark ? 'bg-white/5 border-white/10' : 'bg-gray-50 border-gray-200'} space-y-3`}>
<h4 className={`text-sm font-medium ${isDark ? 'text-white/70' : 'text-slate-600'}`}>Anzeigeoptionen</h4>
<div className="flex flex-col gap-2">
<label className={`flex items-center gap-3 cursor-pointer ${isDark ? 'text-white' : 'text-slate-900'}`}>
<input type="checkbox" checked={showIpa} onChange={(e) => setShowIpa(e.target.checked)} className="w-5 h-5 rounded border-2 border-purple-500 text-purple-500 focus:ring-purple-500" />
<div>
<span>Lautschrift (IPA) anzeigen</span>
<p className={`text-xs ${isDark ? 'text-white/40' : 'text-slate-400'}`}>z.B. achieve [əˈtʃiːv]</p>
</div>
</label>
<label className={`flex items-center gap-3 cursor-pointer ${isDark ? 'text-white' : 'text-slate-900'}`}>
<input type="checkbox" checked={showSyllables} onChange={(e) => setShowSyllables(e.target.checked)} className="w-5 h-5 rounded border-2 border-purple-500 text-purple-500 focus:ring-purple-500" />
<div>
<span>Silbentrennung anzeigen</span>
<p className={`text-xs ${isDark ? 'text-white/40' : 'text-slate-400'}`}>z.B. Schmet|ter|ling</p>
</div>
</label>
</div>
</div>
<button <button
onClick={generateWorksheet} onClick={generateWorksheet}
disabled={(selectedFormat === 'standard' && selectedTypes.length === 0) || isGenerating} disabled={(selectedFormat === 'standard' && selectedTypes.length === 0) || isGenerating}