Replace old OCR pipeline with Kombi pipeline + add IPA/syllable toggles
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 41s
CI / test-go-edu-search (push) Successful in 37s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 32s
CI / test-nodejs-website (push) Successful in 33s

Backend:
- _run_ocr_pipeline_for_page() now runs the full Kombi pipeline:
  orientation → deskew → dewarp → content crop → dual-engine OCR
  (RapidOCR + Tesseract merge) → _build_grid_core() with pipe-autocorrect,
  word-gap merge, dictionary detection
- process-single-page now accepts ipa_mode and syllable_mode query params
  (auto | all | de | en | none; default: none)
- Pipeline sessions are visible in the admin OCR Kombi UI for debugging

Frontend (vocab-worksheet):
- New "Anzeigeoptionen" section with IPA and syllable toggles
- Settings are passed to process-single-page as query parameters

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-11 00:43:42 +02:00
parent 2828871e42
commit 3b78baf37f
2 changed files with 235 additions and 137 deletions

View File

@@ -1283,12 +1283,18 @@ async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Qu
async def process_single_page(
session_id: str,
page_number: int,
ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
):
"""
Process a SINGLE page of an uploaded PDF using the OCR pipeline.
Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.
Uses the multi-step CV pipeline (deskew → dewarp → columns → rows → words)
instead of LLM vision for much better extraction quality.
Uses the full Kombi pipeline (orientation → deskew → dewarp → crop →
dual-engine OCR → grid-build with autocorrect/merge) for best quality.
Query params:
ipa_mode: "none" (default), "auto", "all", "en", "de"
syllable_mode: "none" (default), "auto", "all", "en", "de"
The frontend should call this sequentially for each page.
Returns the vocabulary for just this one page.
@@ -1316,6 +1322,7 @@ async def process_single_page(
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
img_bgr, page_number, session_id,
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
)
except Exception as e:
logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
@@ -1384,28 +1391,33 @@ async def _run_ocr_pipeline_for_page(
img_bgr: np.ndarray,
page_number: int,
vocab_session_id: str,
*,
ipa_mode: str = "none",
syllable_mode: str = "none",
) -> tuple:
"""Run the full OCR pipeline on a single page image and return vocab entries.
"""Run the full Kombi OCR pipeline on a single page and return vocab entries.
Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py).
Uses the same pipeline as the admin OCR Kombi pipeline:
orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
(with pipe-autocorrect, word-gap merge, dictionary detection, etc.)
Args:
img_bgr: BGR numpy array (from render_pdf_high_res, same as admin pipeline).
img_bgr: BGR numpy array.
page_number: 0-indexed page number.
vocab_session_id: Vocab session ID for logging.
ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".
Steps: deskew → dewarp → columns → rows → words → (LLM review)
Returns (entries, rotation_deg) where entries is a list of dicts and
rotation_deg is the orientation correction applied (0, 90, 180, 270).
"""
import time as _time
t_total = _time.time()
img_h, img_w = img_bgr.shape[:2]
logger.info(f"OCR Pipeline page {page_number + 1}: image {img_w}x{img_h}")
logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")
# 1b. Orientation detection (fix upside-down scans)
# 1. Orientation detection (fix upside-down scans)
t0 = _time.time()
img_bgr, rotation = detect_and_fix_orientation(img_bgr)
if rotation:
@@ -1414,7 +1426,7 @@ async def _run_ocr_pipeline_for_page(
else:
logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")
# 2. Create pipeline session in DB (for debugging in admin UI)
# 2. Create pipeline session in DB (visible in admin Kombi UI)
pipeline_session_id = str(uuid.uuid4())
try:
_, png_buf = cv2.imencode(".png", img_bgr)
@@ -1428,155 +1440,216 @@ async def _run_ocr_pipeline_for_page(
except Exception as e:
logger.warning(f"Could not create pipeline session in DB: {e}")
# 3. Three-pass deskew: iterative + word-alignment + text-line regression
# 3. Three-pass deskew
t0 = _time.time()
deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
angle_pass1 = deskew_debug.get("pass1_angle", 0.0)
angle_pass2 = deskew_debug.get("pass2_angle", 0.0)
angle_pass3 = deskew_debug.get("pass3_angle", 0.0)
logger.info(f" deskew: p1={angle_pass1:.2f} p2={angle_pass2:.2f} "
f"p3={angle_pass3:.2f} total={angle_applied:.2f} "
f"({_time.time() - t0:.1f}s)")
logger.info(f" deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")
# 4. Dewarp
t0 = _time.time()
dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")
# 5. Column detection
# 5. Content crop (removes scanner borders, gutter shadows)
t0 = _time.time()
ocr_img = create_ocr_image(dewarped_bgr)
h, w = ocr_img.shape[:2]
geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
if geo_result is None:
layout_img = create_layout_image(dewarped_bgr)
regions = analyze_layout(layout_img, ocr_img)
word_dicts = None
inv = None
content_bounds = None
else:
geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
content_w = right_x - left_x
header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
top_y=top_y, header_y=header_y, footer_y=footer_y)
geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
content_h = bottom_y - top_y
regions = positional_column_regions(geometries, content_w, content_h, left_x)
content_bounds = (left_x, right_x, top_y, bottom_y)
logger.info(f" columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")
# 6. Row detection
t0 = _time.time()
if word_dicts is None or inv is None or content_bounds is None:
# Re-run geometry detection to get intermediates
geo_result2 = detect_column_geometry(ocr_img, dewarped_bgr)
if geo_result2 is None:
raise ValueError("Column geometry detection failed — cannot detect rows")
_, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result2
content_bounds = (left_x, right_x, top_y, bottom_y)
left_x, right_x, top_y, bottom_y = content_bounds
rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
logger.info(f" rows: {len(rows)} detected ({_time.time() - t0:.1f}s)")
# 7. Word recognition (cell-first OCR v2)
t0 = _time.time()
col_regions = regions # already PageRegion objects
# Populate row.words for word_count filtering
for row in rows:
row_y_rel = row.y - top_y
row_bottom_rel = row_y_rel + row.height
row.words = [
wd for wd in word_dicts
if row_y_rel <= wd['top'] + wd['height'] / 2 < row_bottom_rel
]
row.word_count = len(row.words)
cells, columns_meta = build_cell_grid_v2(
ocr_img, col_regions, rows, img_w, img_h,
ocr_engine="auto", img_bgr=dewarped_bgr,
)
col_types = {c['type'] for c in columns_meta}
is_vocab = bool(col_types & {'column_en', 'column_de'})
logger.info(f" words: {len(cells)} cells, vocab={is_vocab} ({_time.time() - t0:.1f}s)")
if not is_vocab:
logger.warning(f" Page {page_number + 1}: layout is not vocab table "
f"(types: {col_types}), returning empty")
return [], rotation
# 8. Map cells → vocab entries
entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_phonetic_brackets(entries, pronunciation="british")
# 9. Optional LLM review
try:
review_result = await llm_review_entries(entries)
if review_result and review_result.get("changes"):
# Apply corrections
changes_map = {}
for ch in review_result["changes"]:
idx = ch.get("index")
if idx is not None:
changes_map[idx] = ch
for idx, ch in changes_map.items():
if 0 <= idx < len(entries):
for field in ("english", "german", "example"):
if ch.get(field) and ch[field] != entries[idx].get(field):
entries[idx][field] = ch[field]
logger.info(f" llm review: {len(review_result['changes'])} corrections applied")
from page_crop import detect_and_crop_page
cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
if crop_result.get("crop_applied"):
dewarped_bgr = cropped_bgr
logger.info(f" crop: applied ({_time.time() - t0:.1f}s)")
else:
logger.info(f" crop: skipped ({_time.time() - t0:.1f}s)")
except Exception as e:
logger.warning(f" llm review skipped: {e}")
logger.warning(f" crop: failed ({e}), continuing with uncropped image")
# 10. Map to frontend format
page_vocabulary = []
for entry in entries:
if not entry.get("english") and not entry.get("german"):
continue # skip empty rows
page_vocabulary.append({
"id": str(uuid.uuid4()),
"english": entry.get("english", ""),
"german": entry.get("german", ""),
"example_sentence": entry.get("example", ""),
"source_page": page_number + 1,
# 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
t0 = _time.time()
img_h, img_w = dewarped_bgr.shape[:2]
# RapidOCR (local ONNX)
try:
from cv_ocr_engines import ocr_region_rapid
from cv_vocab_types import PageRegion
full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
except Exception as e:
logger.warning(f" RapidOCR failed: {e}")
rapid_words = []
# Tesseract
from PIL import Image
import pytesseract
pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
data = pytesseract.image_to_data(
pil_img, lang="eng+deu", config="--psm 6 --oem 3",
output_type=pytesseract.Output.DICT,
)
tess_words = []
for i in range(len(data["text"])):
text = str(data["text"][i]).strip()
conf_raw = str(data["conf"][i])
conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
if not text or conf < 20:
continue
tess_words.append({
"text": text,
"left": data["left"][i], "top": data["top"][i],
"width": data["width"][i], "height": data["height"][i],
"conf": conf,
})
# 11. Update pipeline session in DB (for admin debugging)
try:
success_dsk, dsk_buf = cv2.imencode(".png", deskewed_bgr)
deskewed_png = dsk_buf.tobytes() if success_dsk else None
success_dwp, dwp_buf = cv2.imencode(".png", dewarped_bgr)
dewarped_png = dwp_buf.tobytes() if success_dwp else None
# Merge dual-engine results
from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
from cv_words_first import build_grid_from_words
rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
if rapid_split or tess_words:
merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
merged_words = _deduplicate_words(merged_words)
else:
merged_words = tess_words # fallback to Tesseract only
# Build initial grid from merged words
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
for cell in cells:
cell["ocr_engine"] = "rapid_kombi"
n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
n_cols = len(columns_meta)
logger.info(f" ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")
# 7. Save word_result to pipeline session (needed by _build_grid_core)
word_result = {
"cells": cells,
"grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
"columns_used": columns_meta,
"layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
"image_width": img_w,
"image_height": img_h,
"duration_seconds": 0,
"ocr_engine": "rapid_kombi",
"raw_tesseract_words": tess_words,
"summary": {
"total_cells": len(cells),
"non_empty_cells": sum(1 for c in cells if c.get("text")),
},
}
# Save images + word_result to pipeline session for admin visibility
try:
_, dsk_buf = cv2.imencode(".png", deskewed_bgr)
_, dwp_buf = cv2.imencode(".png", dewarped_bgr)
await update_pipeline_session_db(
pipeline_session_id,
deskewed_png=deskewed_png,
dewarped_png=dewarped_png,
deskewed_png=dsk_buf.tobytes(),
dewarped_png=dwp_buf.tobytes(),
cropped_png=cv2.imencode(".png", dewarped_bgr)[1].tobytes(),
word_result=word_result,
deskew_result={"angle_applied": round(angle_applied, 3)},
dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
column_result={"columns": [{"type": r.type, "x": r.x, "y": r.y,
"width": r.width, "height": r.height}
for r in col_regions]},
row_result={"total_rows": len(rows)},
word_result={
"entry_count": len(page_vocabulary),
"layout": "vocab",
"vocab_entries": entries,
},
current_step=6,
current_step=8,
)
except Exception as e:
logger.warning(f"Could not update pipeline session: {e}")
# 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
t0 = _time.time()
try:
from grid_editor_api import _build_grid_core
session_data = {
"word_result": word_result,
}
grid_result = await _build_grid_core(
pipeline_session_id, session_data,
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
)
logger.info(f" grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
f"({_time.time() - t0:.1f}s)")
# Save grid result to pipeline session
try:
await update_pipeline_session_db(
pipeline_session_id,
grid_editor_result=grid_result,
current_step=11,
)
except Exception:
pass
except Exception as e:
logger.warning(f" grid-build failed: {e}, falling back to basic grid")
grid_result = None
# 9. Extract vocab entries from grid result (zones → cells → vocab)
page_vocabulary = []
if grid_result and grid_result.get("zones"):
# Extract from the improved zone-based grid
for zone in grid_result["zones"]:
zone_cols = zone.get("columns", [])
zone_cells = zone.get("cells", [])
if not zone_cols or not zone_cells:
continue
# Build col_index → col_type map
col_type_map = {}
for col in zone_cols:
ci = col.get("col_index", col.get("index", -1))
col_type_map[ci] = col.get("type", col.get("col_type", ""))
# Group cells by row
rows_map = {}
for cell in zone_cells:
ri = cell.get("row_index", 0)
if ri not in rows_map:
rows_map[ri] = {}
ci = cell.get("col_index", 0)
rows_map[ri][ci] = cell
for ri in sorted(rows_map.keys()):
row_cells = rows_map[ri]
en = ""
de = ""
ex = ""
for ci, cell in row_cells.items():
ct = col_type_map.get(ci, "")
text = (cell.get("text") or "").strip()
if not text:
continue
if "en" in ct:
en = text
elif "de" in ct:
de = text
elif "example" in ct or "text" in ct:
ex = text if not ex else ex + " " + text
if en or de:
page_vocabulary.append({
"id": str(uuid.uuid4()),
"english": en,
"german": de,
"example_sentence": ex,
"source_page": page_number + 1,
})
else:
# Fallback: use basic cells → vocab entries
entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_phonetic_brackets(entries, pronunciation="british")
for entry in entries:
if not entry.get("english") and not entry.get("german"):
continue
page_vocabulary.append({
"id": str(uuid.uuid4()),
"english": entry.get("english", ""),
"german": entry.get("german", ""),
"example_sentence": entry.get("example", ""),
"source_page": page_number + 1,
})
total_duration = _time.time() - t_total
logger.info(f"OCR Pipeline page {page_number + 1}: "
logger.info(f"Kombi Pipeline page {page_number + 1}: "
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
return page_vocabulary, rotation

View File

@@ -156,6 +156,8 @@ export default function VocabWorksheetPage() {
const [includeSolutions, setIncludeSolutions] = useState(true)
const [lineHeight, setLineHeight] = useState('normal')
const [selectedFormat, setSelectedFormat] = useState<WorksheetFormat>('standard')
const [showIpa, setShowIpa] = useState(false)
const [showSyllables, setShowSyllables] = useState(false)
// Export state
const [worksheetId, setWorksheetId] = useState<string | null>(null)
@@ -431,7 +433,9 @@ export default function VocabWorksheetPage() {
const API_BASE = getApiBase()
try {
const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}`, {
const ipaParam = showIpa ? 'auto' : 'none'
const syllableParam = showSyllables ? 'auto' : 'none'
const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}?ipa_mode=${ipaParam}&syllable_mode=${syllableParam}`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ ocr_prompts: ocrPrompts }),
@@ -1907,6 +1911,27 @@ export default function VocabWorksheetPage() {
)}
</div>
{/* OCR display options */}
<div className={`p-4 rounded-xl border ${isDark ? 'bg-white/5 border-white/10' : 'bg-gray-50 border-gray-200'} space-y-3`}>
<h4 className={`text-sm font-medium ${isDark ? 'text-white/70' : 'text-slate-600'}`}>Anzeigeoptionen</h4>
<div className="flex flex-col gap-2">
<label className={`flex items-center gap-3 cursor-pointer ${isDark ? 'text-white' : 'text-slate-900'}`}>
<input type="checkbox" checked={showIpa} onChange={(e) => setShowIpa(e.target.checked)} className="w-5 h-5 rounded border-2 border-purple-500 text-purple-500 focus:ring-purple-500" />
<div>
<span>Lautschrift (IPA) anzeigen</span>
<p className={`text-xs ${isDark ? 'text-white/40' : 'text-slate-400'}`}>z.B. achieve [əˈtʃiːv]</p>
</div>
</label>
<label className={`flex items-center gap-3 cursor-pointer ${isDark ? 'text-white' : 'text-slate-900'}`}>
<input type="checkbox" checked={showSyllables} onChange={(e) => setShowSyllables(e.target.checked)} className="w-5 h-5 rounded border-2 border-purple-500 text-purple-500 focus:ring-purple-500" />
<div>
<span>Silbentrennung anzeigen</span>
<p className={`text-xs ${isDark ? 'text-white/40' : 'text-slate-400'}`}>z.B. Schmet|ter|ling</p>
</div>
</label>
</div>
</div>
<button
onClick={generateWorksheet}
disabled={(selectedFormat === 'standard' && selectedTypes.length === 0) || isGenerating}