Replace old OCR pipeline with Kombi pipeline + add IPA/syllable toggles
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 41s
CI / test-go-edu-search (push) Successful in 37s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 32s
CI / test-nodejs-website (push) Successful in 33s

Backend:
- _run_ocr_pipeline_for_page() now runs the full Kombi pipeline:
  orientation → deskew → dewarp → content crop → dual-engine OCR
  (RapidOCR + Tesseract merge) → _build_grid_core() with pipe-autocorrect,
  word-gap merge, dictionary detection
- process-single-page now accepts ipa_mode and syllable_mode query params (auto|all|de|en|none; default: none)
- Pipeline sessions are visible in the admin OCR Kombi UI for debugging

Frontend (vocab-worksheet):
- New "Anzeigeoptionen" (display options) section with IPA and syllable toggles
- Settings are passed to process-single-page as query parameters

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-11 00:43:42 +02:00
parent 2828871e42
commit 3b78baf37f
2 changed files with 235 additions and 137 deletions

View File

@@ -1283,12 +1283,18 @@ async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Qu
async def process_single_page( async def process_single_page(
session_id: str, session_id: str,
page_number: int, page_number: int,
ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
): ):
""" """
Process a SINGLE page of an uploaded PDF using the OCR pipeline. Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.
Uses the multi-step CV pipeline (deskew → dewarp → columns → rows → words) Uses the full Kombi pipeline (orientation → deskew → dewarp → crop →
instead of LLM vision for much better extraction quality. dual-engine OCR → grid-build with autocorrect/merge) for best quality.
Query params:
ipa_mode: "none" (default), "auto", "all", "en", "de"
syllable_mode: "none" (default), "auto", "all", "en", "de"
The frontend should call this sequentially for each page. The frontend should call this sequentially for each page.
Returns the vocabulary for just this one page. Returns the vocabulary for just this one page.
@@ -1316,6 +1322,7 @@ async def process_single_page(
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0) img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page( page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
img_bgr, page_number, session_id, img_bgr, page_number, session_id,
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
) )
except Exception as e: except Exception as e:
logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True) logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
@@ -1384,28 +1391,33 @@ async def _run_ocr_pipeline_for_page(
img_bgr: np.ndarray, img_bgr: np.ndarray,
page_number: int, page_number: int,
vocab_session_id: str, vocab_session_id: str,
*,
ipa_mode: str = "none",
syllable_mode: str = "none",
) -> tuple: ) -> tuple:
"""Run the full OCR pipeline on a single page image and return vocab entries. """Run the full Kombi OCR pipeline on a single page and return vocab entries.
Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py). Uses the same pipeline as the admin OCR Kombi pipeline:
orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
(with pipe-autocorrect, word-gap merge, dictionary detection, etc.)
Args: Args:
img_bgr: BGR numpy array (from render_pdf_high_res, same as admin pipeline). img_bgr: BGR numpy array.
page_number: 0-indexed page number. page_number: 0-indexed page number.
vocab_session_id: Vocab session ID for logging. vocab_session_id: Vocab session ID for logging.
ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".
Steps: deskew → dewarp → columns → rows → words → (LLM review)
Returns (entries, rotation_deg) where entries is a list of dicts and Returns (entries, rotation_deg) where entries is a list of dicts and
rotation_deg is the orientation correction applied (0, 90, 180, 270). rotation_deg is the orientation correction applied (0, 90, 180, 270).
""" """
import time as _time import time as _time
t_total = _time.time() t_total = _time.time()
img_h, img_w = img_bgr.shape[:2] img_h, img_w = img_bgr.shape[:2]
logger.info(f"OCR Pipeline page {page_number + 1}: image {img_w}x{img_h}") logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")
# 1b. Orientation detection (fix upside-down scans) # 1. Orientation detection (fix upside-down scans)
t0 = _time.time() t0 = _time.time()
img_bgr, rotation = detect_and_fix_orientation(img_bgr) img_bgr, rotation = detect_and_fix_orientation(img_bgr)
if rotation: if rotation:
@@ -1414,7 +1426,7 @@ async def _run_ocr_pipeline_for_page(
else: else:
logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)") logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")
# 2. Create pipeline session in DB (for debugging in admin UI) # 2. Create pipeline session in DB (visible in admin Kombi UI)
pipeline_session_id = str(uuid.uuid4()) pipeline_session_id = str(uuid.uuid4())
try: try:
_, png_buf = cv2.imencode(".png", img_bgr) _, png_buf = cv2.imencode(".png", img_bgr)
@@ -1428,155 +1440,216 @@ async def _run_ocr_pipeline_for_page(
except Exception as e: except Exception as e:
logger.warning(f"Could not create pipeline session in DB: {e}") logger.warning(f"Could not create pipeline session in DB: {e}")
# 3. Three-pass deskew: iterative + word-alignment + text-line regression # 3. Three-pass deskew
t0 = _time.time() t0 = _time.time()
deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy()) deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
angle_pass1 = deskew_debug.get("pass1_angle", 0.0) logger.info(f" deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")
angle_pass2 = deskew_debug.get("pass2_angle", 0.0)
angle_pass3 = deskew_debug.get("pass3_angle", 0.0)
logger.info(f" deskew: p1={angle_pass1:.2f} p2={angle_pass2:.2f} "
f"p3={angle_pass3:.2f} total={angle_applied:.2f} "
f"({_time.time() - t0:.1f}s)")
# 4. Dewarp # 4. Dewarp
t0 = _time.time() t0 = _time.time()
dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr) dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)") logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")
# 5. Column detection # 5. Content crop (removes scanner borders, gutter shadows)
t0 = _time.time() t0 = _time.time()
ocr_img = create_ocr_image(dewarped_bgr)
h, w = ocr_img.shape[:2]
geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
if geo_result is None:
layout_img = create_layout_image(dewarped_bgr)
regions = analyze_layout(layout_img, ocr_img)
word_dicts = None
inv = None
content_bounds = None
else:
geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
content_w = right_x - left_x
header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
top_y=top_y, header_y=header_y, footer_y=footer_y)
geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
content_h = bottom_y - top_y
regions = positional_column_regions(geometries, content_w, content_h, left_x)
content_bounds = (left_x, right_x, top_y, bottom_y)
logger.info(f" columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")
# 6. Row detection
t0 = _time.time()
if word_dicts is None or inv is None or content_bounds is None:
# Re-run geometry detection to get intermediates
geo_result2 = detect_column_geometry(ocr_img, dewarped_bgr)
if geo_result2 is None:
raise ValueError("Column geometry detection failed — cannot detect rows")
_, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result2
content_bounds = (left_x, right_x, top_y, bottom_y)
left_x, right_x, top_y, bottom_y = content_bounds
rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
logger.info(f" rows: {len(rows)} detected ({_time.time() - t0:.1f}s)")
# 7. Word recognition (cell-first OCR v2)
t0 = _time.time()
col_regions = regions # already PageRegion objects
# Populate row.words for word_count filtering
for row in rows:
row_y_rel = row.y - top_y
row_bottom_rel = row_y_rel + row.height
row.words = [
wd for wd in word_dicts
if row_y_rel <= wd['top'] + wd['height'] / 2 < row_bottom_rel
]
row.word_count = len(row.words)
cells, columns_meta = build_cell_grid_v2(
ocr_img, col_regions, rows, img_w, img_h,
ocr_engine="auto", img_bgr=dewarped_bgr,
)
col_types = {c['type'] for c in columns_meta}
is_vocab = bool(col_types & {'column_en', 'column_de'})
logger.info(f" words: {len(cells)} cells, vocab={is_vocab} ({_time.time() - t0:.1f}s)")
if not is_vocab:
logger.warning(f" Page {page_number + 1}: layout is not vocab table "
f"(types: {col_types}), returning empty")
return [], rotation
# 8. Map cells → vocab entries
entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_phonetic_brackets(entries, pronunciation="british")
# 9. Optional LLM review
try: try:
review_result = await llm_review_entries(entries) from page_crop import detect_and_crop_page
if review_result and review_result.get("changes"): cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
# Apply corrections if crop_result.get("crop_applied"):
changes_map = {} dewarped_bgr = cropped_bgr
for ch in review_result["changes"]: logger.info(f" crop: applied ({_time.time() - t0:.1f}s)")
idx = ch.get("index") else:
if idx is not None: logger.info(f" crop: skipped ({_time.time() - t0:.1f}s)")
changes_map[idx] = ch
for idx, ch in changes_map.items():
if 0 <= idx < len(entries):
for field in ("english", "german", "example"):
if ch.get(field) and ch[field] != entries[idx].get(field):
entries[idx][field] = ch[field]
logger.info(f" llm review: {len(review_result['changes'])} corrections applied")
except Exception as e: except Exception as e:
logger.warning(f" llm review skipped: {e}") logger.warning(f" crop: failed ({e}), continuing with uncropped image")
# 10. Map to frontend format # 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
page_vocabulary = [] t0 = _time.time()
for entry in entries: img_h, img_w = dewarped_bgr.shape[:2]
if not entry.get("english") and not entry.get("german"):
continue # skip empty rows # RapidOCR (local ONNX)
page_vocabulary.append({ try:
"id": str(uuid.uuid4()), from cv_ocr_engines import ocr_region_rapid
"english": entry.get("english", ""), from cv_vocab_types import PageRegion
"german": entry.get("german", ""), full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
"example_sentence": entry.get("example", ""), rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
"source_page": page_number + 1, except Exception as e:
logger.warning(f" RapidOCR failed: {e}")
rapid_words = []
# Tesseract
from PIL import Image
import pytesseract
pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
data = pytesseract.image_to_data(
pil_img, lang="eng+deu", config="--psm 6 --oem 3",
output_type=pytesseract.Output.DICT,
)
tess_words = []
for i in range(len(data["text"])):
text = str(data["text"][i]).strip()
conf_raw = str(data["conf"][i])
conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
if not text or conf < 20:
continue
tess_words.append({
"text": text,
"left": data["left"][i], "top": data["top"][i],
"width": data["width"][i], "height": data["height"][i],
"conf": conf,
}) })
# 11. Update pipeline session in DB (for admin debugging) # Merge dual-engine results
try: from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
success_dsk, dsk_buf = cv2.imencode(".png", deskewed_bgr) from cv_words_first import build_grid_from_words
deskewed_png = dsk_buf.tobytes() if success_dsk else None
success_dwp, dwp_buf = cv2.imencode(".png", dewarped_bgr)
dewarped_png = dwp_buf.tobytes() if success_dwp else None
rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
if rapid_split or tess_words:
merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
merged_words = _deduplicate_words(merged_words)
else:
merged_words = tess_words # fallback to Tesseract only
# Build initial grid from merged words
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
for cell in cells:
cell["ocr_engine"] = "rapid_kombi"
n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
n_cols = len(columns_meta)
logger.info(f" ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")
# 7. Save word_result to pipeline session (needed by _build_grid_core)
word_result = {
"cells": cells,
"grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
"columns_used": columns_meta,
"layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
"image_width": img_w,
"image_height": img_h,
"duration_seconds": 0,
"ocr_engine": "rapid_kombi",
"raw_tesseract_words": tess_words,
"summary": {
"total_cells": len(cells),
"non_empty_cells": sum(1 for c in cells if c.get("text")),
},
}
# Save images + word_result to pipeline session for admin visibility
try:
_, dsk_buf = cv2.imencode(".png", deskewed_bgr)
_, dwp_buf = cv2.imencode(".png", dewarped_bgr)
await update_pipeline_session_db( await update_pipeline_session_db(
pipeline_session_id, pipeline_session_id,
deskewed_png=deskewed_png, deskewed_png=dsk_buf.tobytes(),
dewarped_png=dewarped_png, dewarped_png=dwp_buf.tobytes(),
cropped_png=cv2.imencode(".png", dewarped_bgr)[1].tobytes(),
word_result=word_result,
deskew_result={"angle_applied": round(angle_applied, 3)}, deskew_result={"angle_applied": round(angle_applied, 3)},
dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)}, dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
column_result={"columns": [{"type": r.type, "x": r.x, "y": r.y, current_step=8,
"width": r.width, "height": r.height}
for r in col_regions]},
row_result={"total_rows": len(rows)},
word_result={
"entry_count": len(page_vocabulary),
"layout": "vocab",
"vocab_entries": entries,
},
current_step=6,
) )
except Exception as e: except Exception as e:
logger.warning(f"Could not update pipeline session: {e}") logger.warning(f"Could not update pipeline session: {e}")
# 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
t0 = _time.time()
try:
from grid_editor_api import _build_grid_core
session_data = {
"word_result": word_result,
}
grid_result = await _build_grid_core(
pipeline_session_id, session_data,
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
)
logger.info(f" grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
f"({_time.time() - t0:.1f}s)")
# Save grid result to pipeline session
try:
await update_pipeline_session_db(
pipeline_session_id,
grid_editor_result=grid_result,
current_step=11,
)
except Exception:
pass
except Exception as e:
logger.warning(f" grid-build failed: {e}, falling back to basic grid")
grid_result = None
# 9. Extract vocab entries from grid result (zones → cells → vocab)
page_vocabulary = []
if grid_result and grid_result.get("zones"):
# Extract from the improved zone-based grid
for zone in grid_result["zones"]:
zone_cols = zone.get("columns", [])
zone_cells = zone.get("cells", [])
if not zone_cols or not zone_cells:
continue
# Build col_index → col_type map
col_type_map = {}
for col in zone_cols:
ci = col.get("col_index", col.get("index", -1))
col_type_map[ci] = col.get("type", col.get("col_type", ""))
# Group cells by row
rows_map = {}
for cell in zone_cells:
ri = cell.get("row_index", 0)
if ri not in rows_map:
rows_map[ri] = {}
ci = cell.get("col_index", 0)
rows_map[ri][ci] = cell
for ri in sorted(rows_map.keys()):
row_cells = rows_map[ri]
en = ""
de = ""
ex = ""
for ci, cell in row_cells.items():
ct = col_type_map.get(ci, "")
text = (cell.get("text") or "").strip()
if not text:
continue
if "en" in ct:
en = text
elif "de" in ct:
de = text
elif "example" in ct or "text" in ct:
ex = text if not ex else ex + " " + text
if en or de:
page_vocabulary.append({
"id": str(uuid.uuid4()),
"english": en,
"german": de,
"example_sentence": ex,
"source_page": page_number + 1,
})
else:
# Fallback: use basic cells → vocab entries
entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_phonetic_brackets(entries, pronunciation="british")
for entry in entries:
if not entry.get("english") and not entry.get("german"):
continue
page_vocabulary.append({
"id": str(uuid.uuid4()),
"english": entry.get("english", ""),
"german": entry.get("german", ""),
"example_sentence": entry.get("example", ""),
"source_page": page_number + 1,
})
total_duration = _time.time() - t_total total_duration = _time.time() - t_total
logger.info(f"OCR Pipeline page {page_number + 1}: " logger.info(f"Kombi Pipeline page {page_number + 1}: "
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s") f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
return page_vocabulary, rotation return page_vocabulary, rotation

View File

@@ -156,6 +156,8 @@ export default function VocabWorksheetPage() {
const [includeSolutions, setIncludeSolutions] = useState(true) const [includeSolutions, setIncludeSolutions] = useState(true)
const [lineHeight, setLineHeight] = useState('normal') const [lineHeight, setLineHeight] = useState('normal')
const [selectedFormat, setSelectedFormat] = useState<WorksheetFormat>('standard') const [selectedFormat, setSelectedFormat] = useState<WorksheetFormat>('standard')
const [showIpa, setShowIpa] = useState(false)
const [showSyllables, setShowSyllables] = useState(false)
// Export state // Export state
const [worksheetId, setWorksheetId] = useState<string | null>(null) const [worksheetId, setWorksheetId] = useState<string | null>(null)
@@ -431,7 +433,9 @@ export default function VocabWorksheetPage() {
const API_BASE = getApiBase() const API_BASE = getApiBase()
try { try {
const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}`, { const ipaParam = showIpa ? 'auto' : 'none'
const syllableParam = showSyllables ? 'auto' : 'none'
const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}?ipa_mode=${ipaParam}&syllable_mode=${syllableParam}`, {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ ocr_prompts: ocrPrompts }), body: JSON.stringify({ ocr_prompts: ocrPrompts }),
@@ -1907,6 +1911,27 @@ export default function VocabWorksheetPage() {
)} )}
</div> </div>
{/* OCR display options */}
<div className={`p-4 rounded-xl border ${isDark ? 'bg-white/5 border-white/10' : 'bg-gray-50 border-gray-200'} space-y-3`}>
<h4 className={`text-sm font-medium ${isDark ? 'text-white/70' : 'text-slate-600'}`}>Anzeigeoptionen</h4>
<div className="flex flex-col gap-2">
<label className={`flex items-center gap-3 cursor-pointer ${isDark ? 'text-white' : 'text-slate-900'}`}>
<input type="checkbox" checked={showIpa} onChange={(e) => setShowIpa(e.target.checked)} className="w-5 h-5 rounded border-2 border-purple-500 text-purple-500 focus:ring-purple-500" />
<div>
<span>Lautschrift (IPA) anzeigen</span>
<p className={`text-xs ${isDark ? 'text-white/40' : 'text-slate-400'}`}>z.B. achieve [əˈtʃiːv]</p>
</div>
</label>
<label className={`flex items-center gap-3 cursor-pointer ${isDark ? 'text-white' : 'text-slate-900'}`}>
<input type="checkbox" checked={showSyllables} onChange={(e) => setShowSyllables(e.target.checked)} className="w-5 h-5 rounded border-2 border-purple-500 text-purple-500 focus:ring-purple-500" />
<div>
<span>Silbentrennung anzeigen</span>
<p className={`text-xs ${isDark ? 'text-white/40' : 'text-slate-400'}`}>z.B. Schmet|ter|ling</p>
</div>
</label>
</div>
</div>
<button <button
onClick={generateWorksheet} onClick={generateWorksheet}
disabled={(selectedFormat === 'standard' && selectedTypes.length === 0) || isGenerating} disabled={(selectedFormat === 'standard' && selectedTypes.length === 0) || isGenerating}