Replace old OCR pipeline with Kombi pipeline + add IPA/syllable toggles
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 41s
CI / test-go-edu-search (push) Successful in 37s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 32s
CI / test-nodejs-website (push) Successful in 33s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 41s
CI / test-go-edu-search (push) Successful in 37s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 32s
CI / test-nodejs-website (push) Successful in 33s
Backend:
- _run_ocr_pipeline_for_page() now runs the full Kombi pipeline: orientation → deskew → dewarp → content crop → dual-engine OCR (RapidOCR + Tesseract merge) → _build_grid_core() with pipe-autocorrect, word-gap merge, dictionary detection
- Accepts ipa_mode and syllable_mode query params on process-single-page
- Pipeline sessions are visible in admin OCR Kombi UI for debugging

Frontend (vocab-worksheet):
- New "Anzeigeoptionen" section with IPA and syllable toggles
- Settings are passed to process-single-page as query parameters

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1283,12 +1283,18 @@ async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Qu
|
|||||||
async def process_single_page(
|
async def process_single_page(
|
||||||
session_id: str,
|
session_id: str,
|
||||||
page_number: int,
|
page_number: int,
|
||||||
|
ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
|
||||||
|
syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Process a SINGLE page of an uploaded PDF using the OCR pipeline.
|
Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.
|
||||||
|
|
||||||
Uses the multi-step CV pipeline (deskew → dewarp → columns → rows → words)
|
Uses the full Kombi pipeline (orientation → deskew → dewarp → crop →
|
||||||
instead of LLM vision for much better extraction quality.
|
dual-engine OCR → grid-build with autocorrect/merge) for best quality.
|
||||||
|
|
||||||
|
Query params:
|
||||||
|
ipa_mode: "none" (default), "auto", "all", "en", "de"
|
||||||
|
syllable_mode: "none" (default), "auto", "all", "en", "de"
|
||||||
|
|
||||||
The frontend should call this sequentially for each page.
|
The frontend should call this sequentially for each page.
|
||||||
Returns the vocabulary for just this one page.
|
Returns the vocabulary for just this one page.
|
||||||
@@ -1316,6 +1322,7 @@ async def process_single_page(
|
|||||||
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
|
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
|
||||||
page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
|
page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
|
||||||
img_bgr, page_number, session_id,
|
img_bgr, page_number, session_id,
|
||||||
|
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
|
logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
|
||||||
@@ -1384,28 +1391,33 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
img_bgr: np.ndarray,
|
img_bgr: np.ndarray,
|
||||||
page_number: int,
|
page_number: int,
|
||||||
vocab_session_id: str,
|
vocab_session_id: str,
|
||||||
|
*,
|
||||||
|
ipa_mode: str = "none",
|
||||||
|
syllable_mode: str = "none",
|
||||||
) -> tuple:
|
) -> tuple:
|
||||||
"""Run the full OCR pipeline on a single page image and return vocab entries.
|
"""Run the full Kombi OCR pipeline on a single page and return vocab entries.
|
||||||
|
|
||||||
Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py).
|
Uses the same pipeline as the admin OCR Kombi pipeline:
|
||||||
|
orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
|
||||||
|
(with pipe-autocorrect, word-gap merge, dictionary detection, etc.)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
img_bgr: BGR numpy array (from render_pdf_high_res, same as admin pipeline).
|
img_bgr: BGR numpy array.
|
||||||
page_number: 0-indexed page number.
|
page_number: 0-indexed page number.
|
||||||
vocab_session_id: Vocab session ID for logging.
|
vocab_session_id: Vocab session ID for logging.
|
||||||
|
ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
|
||||||
|
syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".
|
||||||
|
|
||||||
Steps: deskew → dewarp → columns → rows → words → (LLM review)
|
|
||||||
Returns (entries, rotation_deg) where entries is a list of dicts and
|
Returns (entries, rotation_deg) where entries is a list of dicts and
|
||||||
rotation_deg is the orientation correction applied (0, 90, 180, 270).
|
rotation_deg is the orientation correction applied (0, 90, 180, 270).
|
||||||
"""
|
"""
|
||||||
import time as _time
|
import time as _time
|
||||||
|
|
||||||
t_total = _time.time()
|
t_total = _time.time()
|
||||||
|
|
||||||
img_h, img_w = img_bgr.shape[:2]
|
img_h, img_w = img_bgr.shape[:2]
|
||||||
logger.info(f"OCR Pipeline page {page_number + 1}: image {img_w}x{img_h}")
|
logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")
|
||||||
|
|
||||||
# 1b. Orientation detection (fix upside-down scans)
|
# 1. Orientation detection (fix upside-down scans)
|
||||||
t0 = _time.time()
|
t0 = _time.time()
|
||||||
img_bgr, rotation = detect_and_fix_orientation(img_bgr)
|
img_bgr, rotation = detect_and_fix_orientation(img_bgr)
|
||||||
if rotation:
|
if rotation:
|
||||||
@@ -1414,7 +1426,7 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
else:
|
else:
|
||||||
logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")
|
logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")
|
||||||
|
|
||||||
# 2. Create pipeline session in DB (for debugging in admin UI)
|
# 2. Create pipeline session in DB (visible in admin Kombi UI)
|
||||||
pipeline_session_id = str(uuid.uuid4())
|
pipeline_session_id = str(uuid.uuid4())
|
||||||
try:
|
try:
|
||||||
_, png_buf = cv2.imencode(".png", img_bgr)
|
_, png_buf = cv2.imencode(".png", img_bgr)
|
||||||
@@ -1428,155 +1440,216 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Could not create pipeline session in DB: {e}")
|
logger.warning(f"Could not create pipeline session in DB: {e}")
|
||||||
|
|
||||||
# 3. Three-pass deskew: iterative + word-alignment + text-line regression
|
# 3. Three-pass deskew
|
||||||
t0 = _time.time()
|
t0 = _time.time()
|
||||||
deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
|
deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
|
||||||
angle_pass1 = deskew_debug.get("pass1_angle", 0.0)
|
logger.info(f" deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")
|
||||||
angle_pass2 = deskew_debug.get("pass2_angle", 0.0)
|
|
||||||
angle_pass3 = deskew_debug.get("pass3_angle", 0.0)
|
|
||||||
|
|
||||||
logger.info(f" deskew: p1={angle_pass1:.2f} p2={angle_pass2:.2f} "
|
|
||||||
f"p3={angle_pass3:.2f} total={angle_applied:.2f} "
|
|
||||||
f"({_time.time() - t0:.1f}s)")
|
|
||||||
|
|
||||||
# 4. Dewarp
|
# 4. Dewarp
|
||||||
t0 = _time.time()
|
t0 = _time.time()
|
||||||
dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
|
dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
|
||||||
logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")
|
logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")
|
||||||
|
|
||||||
# 5. Column detection
|
# 5. Content crop (removes scanner borders, gutter shadows)
|
||||||
t0 = _time.time()
|
t0 = _time.time()
|
||||||
ocr_img = create_ocr_image(dewarped_bgr)
|
|
||||||
h, w = ocr_img.shape[:2]
|
|
||||||
|
|
||||||
geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
|
|
||||||
if geo_result is None:
|
|
||||||
layout_img = create_layout_image(dewarped_bgr)
|
|
||||||
regions = analyze_layout(layout_img, ocr_img)
|
|
||||||
word_dicts = None
|
|
||||||
inv = None
|
|
||||||
content_bounds = None
|
|
||||||
else:
|
|
||||||
geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
|
|
||||||
content_w = right_x - left_x
|
|
||||||
header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
|
|
||||||
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
|
||||||
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
|
||||||
geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
|
|
||||||
geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
|
|
||||||
content_h = bottom_y - top_y
|
|
||||||
regions = positional_column_regions(geometries, content_w, content_h, left_x)
|
|
||||||
content_bounds = (left_x, right_x, top_y, bottom_y)
|
|
||||||
|
|
||||||
logger.info(f" columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")
|
|
||||||
|
|
||||||
# 6. Row detection
|
|
||||||
t0 = _time.time()
|
|
||||||
if word_dicts is None or inv is None or content_bounds is None:
|
|
||||||
# Re-run geometry detection to get intermediates
|
|
||||||
geo_result2 = detect_column_geometry(ocr_img, dewarped_bgr)
|
|
||||||
if geo_result2 is None:
|
|
||||||
raise ValueError("Column geometry detection failed — cannot detect rows")
|
|
||||||
_, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result2
|
|
||||||
content_bounds = (left_x, right_x, top_y, bottom_y)
|
|
||||||
|
|
||||||
left_x, right_x, top_y, bottom_y = content_bounds
|
|
||||||
rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
|
|
||||||
logger.info(f" rows: {len(rows)} detected ({_time.time() - t0:.1f}s)")
|
|
||||||
|
|
||||||
# 7. Word recognition (cell-first OCR v2)
|
|
||||||
t0 = _time.time()
|
|
||||||
col_regions = regions # already PageRegion objects
|
|
||||||
|
|
||||||
# Populate row.words for word_count filtering
|
|
||||||
for row in rows:
|
|
||||||
row_y_rel = row.y - top_y
|
|
||||||
row_bottom_rel = row_y_rel + row.height
|
|
||||||
row.words = [
|
|
||||||
wd for wd in word_dicts
|
|
||||||
if row_y_rel <= wd['top'] + wd['height'] / 2 < row_bottom_rel
|
|
||||||
]
|
|
||||||
row.word_count = len(row.words)
|
|
||||||
|
|
||||||
cells, columns_meta = build_cell_grid_v2(
|
|
||||||
ocr_img, col_regions, rows, img_w, img_h,
|
|
||||||
ocr_engine="auto", img_bgr=dewarped_bgr,
|
|
||||||
)
|
|
||||||
|
|
||||||
col_types = {c['type'] for c in columns_meta}
|
|
||||||
is_vocab = bool(col_types & {'column_en', 'column_de'})
|
|
||||||
logger.info(f" words: {len(cells)} cells, vocab={is_vocab} ({_time.time() - t0:.1f}s)")
|
|
||||||
|
|
||||||
if not is_vocab:
|
|
||||||
logger.warning(f" Page {page_number + 1}: layout is not vocab table "
|
|
||||||
f"(types: {col_types}), returning empty")
|
|
||||||
return [], rotation
|
|
||||||
|
|
||||||
# 8. Map cells → vocab entries
|
|
||||||
entries = _cells_to_vocab_entries(cells, columns_meta)
|
|
||||||
entries = _fix_phonetic_brackets(entries, pronunciation="british")
|
|
||||||
|
|
||||||
# 9. Optional LLM review
|
|
||||||
try:
|
try:
|
||||||
review_result = await llm_review_entries(entries)
|
from page_crop import detect_and_crop_page
|
||||||
if review_result and review_result.get("changes"):
|
cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
|
||||||
# Apply corrections
|
if crop_result.get("crop_applied"):
|
||||||
changes_map = {}
|
dewarped_bgr = cropped_bgr
|
||||||
for ch in review_result["changes"]:
|
logger.info(f" crop: applied ({_time.time() - t0:.1f}s)")
|
||||||
idx = ch.get("index")
|
else:
|
||||||
if idx is not None:
|
logger.info(f" crop: skipped ({_time.time() - t0:.1f}s)")
|
||||||
changes_map[idx] = ch
|
|
||||||
for idx, ch in changes_map.items():
|
|
||||||
if 0 <= idx < len(entries):
|
|
||||||
for field in ("english", "german", "example"):
|
|
||||||
if ch.get(field) and ch[field] != entries[idx].get(field):
|
|
||||||
entries[idx][field] = ch[field]
|
|
||||||
logger.info(f" llm review: {len(review_result['changes'])} corrections applied")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f" llm review skipped: {e}")
|
logger.warning(f" crop: failed ({e}), continuing with uncropped image")
|
||||||
|
|
||||||
# 10. Map to frontend format
|
# 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
|
||||||
page_vocabulary = []
|
t0 = _time.time()
|
||||||
for entry in entries:
|
img_h, img_w = dewarped_bgr.shape[:2]
|
||||||
if not entry.get("english") and not entry.get("german"):
|
|
||||||
continue # skip empty rows
|
# RapidOCR (local ONNX)
|
||||||
page_vocabulary.append({
|
try:
|
||||||
"id": str(uuid.uuid4()),
|
from cv_ocr_engines import ocr_region_rapid
|
||||||
"english": entry.get("english", ""),
|
from cv_vocab_types import PageRegion
|
||||||
"german": entry.get("german", ""),
|
full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
|
||||||
"example_sentence": entry.get("example", ""),
|
rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
|
||||||
"source_page": page_number + 1,
|
except Exception as e:
|
||||||
|
logger.warning(f" RapidOCR failed: {e}")
|
||||||
|
rapid_words = []
|
||||||
|
|
||||||
|
# Tesseract
|
||||||
|
from PIL import Image
|
||||||
|
import pytesseract
|
||||||
|
pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
|
||||||
|
data = pytesseract.image_to_data(
|
||||||
|
pil_img, lang="eng+deu", config="--psm 6 --oem 3",
|
||||||
|
output_type=pytesseract.Output.DICT,
|
||||||
|
)
|
||||||
|
tess_words = []
|
||||||
|
for i in range(len(data["text"])):
|
||||||
|
text = str(data["text"][i]).strip()
|
||||||
|
conf_raw = str(data["conf"][i])
|
||||||
|
conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
|
||||||
|
if not text or conf < 20:
|
||||||
|
continue
|
||||||
|
tess_words.append({
|
||||||
|
"text": text,
|
||||||
|
"left": data["left"][i], "top": data["top"][i],
|
||||||
|
"width": data["width"][i], "height": data["height"][i],
|
||||||
|
"conf": conf,
|
||||||
})
|
})
|
||||||
|
|
||||||
# 11. Update pipeline session in DB (for admin debugging)
|
# Merge dual-engine results
|
||||||
try:
|
from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
|
||||||
success_dsk, dsk_buf = cv2.imencode(".png", deskewed_bgr)
|
from cv_words_first import build_grid_from_words
|
||||||
deskewed_png = dsk_buf.tobytes() if success_dsk else None
|
|
||||||
success_dwp, dwp_buf = cv2.imencode(".png", dewarped_bgr)
|
|
||||||
dewarped_png = dwp_buf.tobytes() if success_dwp else None
|
|
||||||
|
|
||||||
|
rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
|
||||||
|
if rapid_split or tess_words:
|
||||||
|
merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
|
||||||
|
merged_words = _deduplicate_words(merged_words)
|
||||||
|
else:
|
||||||
|
merged_words = tess_words # fallback to Tesseract only
|
||||||
|
|
||||||
|
# Build initial grid from merged words
|
||||||
|
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
|
||||||
|
for cell in cells:
|
||||||
|
cell["ocr_engine"] = "rapid_kombi"
|
||||||
|
|
||||||
|
n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
|
||||||
|
n_cols = len(columns_meta)
|
||||||
|
logger.info(f" ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
|
||||||
|
f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")
|
||||||
|
|
||||||
|
# 7. Save word_result to pipeline session (needed by _build_grid_core)
|
||||||
|
word_result = {
|
||||||
|
"cells": cells,
|
||||||
|
"grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
|
||||||
|
"columns_used": columns_meta,
|
||||||
|
"layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
|
||||||
|
"image_width": img_w,
|
||||||
|
"image_height": img_h,
|
||||||
|
"duration_seconds": 0,
|
||||||
|
"ocr_engine": "rapid_kombi",
|
||||||
|
"raw_tesseract_words": tess_words,
|
||||||
|
"summary": {
|
||||||
|
"total_cells": len(cells),
|
||||||
|
"non_empty_cells": sum(1 for c in cells if c.get("text")),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
# Save images + word_result to pipeline session for admin visibility
|
||||||
|
try:
|
||||||
|
_, dsk_buf = cv2.imencode(".png", deskewed_bgr)
|
||||||
|
_, dwp_buf = cv2.imencode(".png", dewarped_bgr)
|
||||||
await update_pipeline_session_db(
|
await update_pipeline_session_db(
|
||||||
pipeline_session_id,
|
pipeline_session_id,
|
||||||
deskewed_png=deskewed_png,
|
deskewed_png=dsk_buf.tobytes(),
|
||||||
dewarped_png=dewarped_png,
|
dewarped_png=dwp_buf.tobytes(),
|
||||||
|
cropped_png=cv2.imencode(".png", dewarped_bgr)[1].tobytes(),
|
||||||
|
word_result=word_result,
|
||||||
deskew_result={"angle_applied": round(angle_applied, 3)},
|
deskew_result={"angle_applied": round(angle_applied, 3)},
|
||||||
dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
|
dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
|
||||||
column_result={"columns": [{"type": r.type, "x": r.x, "y": r.y,
|
current_step=8,
|
||||||
"width": r.width, "height": r.height}
|
|
||||||
for r in col_regions]},
|
|
||||||
row_result={"total_rows": len(rows)},
|
|
||||||
word_result={
|
|
||||||
"entry_count": len(page_vocabulary),
|
|
||||||
"layout": "vocab",
|
|
||||||
"vocab_entries": entries,
|
|
||||||
},
|
|
||||||
current_step=6,
|
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Could not update pipeline session: {e}")
|
logger.warning(f"Could not update pipeline session: {e}")
|
||||||
|
|
||||||
|
# 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
|
||||||
|
t0 = _time.time()
|
||||||
|
try:
|
||||||
|
from grid_editor_api import _build_grid_core
|
||||||
|
session_data = {
|
||||||
|
"word_result": word_result,
|
||||||
|
}
|
||||||
|
grid_result = await _build_grid_core(
|
||||||
|
pipeline_session_id, session_data,
|
||||||
|
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
|
||||||
|
)
|
||||||
|
logger.info(f" grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
|
||||||
|
f"({_time.time() - t0:.1f}s)")
|
||||||
|
|
||||||
|
# Save grid result to pipeline session
|
||||||
|
try:
|
||||||
|
await update_pipeline_session_db(
|
||||||
|
pipeline_session_id,
|
||||||
|
grid_editor_result=grid_result,
|
||||||
|
current_step=11,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f" grid-build failed: {e}, falling back to basic grid")
|
||||||
|
grid_result = None
|
||||||
|
|
||||||
|
# 9. Extract vocab entries from grid result (zones → cells → vocab)
|
||||||
|
page_vocabulary = []
|
||||||
|
|
||||||
|
if grid_result and grid_result.get("zones"):
|
||||||
|
# Extract from the improved zone-based grid
|
||||||
|
for zone in grid_result["zones"]:
|
||||||
|
zone_cols = zone.get("columns", [])
|
||||||
|
zone_cells = zone.get("cells", [])
|
||||||
|
if not zone_cols or not zone_cells:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Build col_index → col_type map
|
||||||
|
col_type_map = {}
|
||||||
|
for col in zone_cols:
|
||||||
|
ci = col.get("col_index", col.get("index", -1))
|
||||||
|
col_type_map[ci] = col.get("type", col.get("col_type", ""))
|
||||||
|
|
||||||
|
# Group cells by row
|
||||||
|
rows_map = {}
|
||||||
|
for cell in zone_cells:
|
||||||
|
ri = cell.get("row_index", 0)
|
||||||
|
if ri not in rows_map:
|
||||||
|
rows_map[ri] = {}
|
||||||
|
ci = cell.get("col_index", 0)
|
||||||
|
rows_map[ri][ci] = cell
|
||||||
|
|
||||||
|
for ri in sorted(rows_map.keys()):
|
||||||
|
row_cells = rows_map[ri]
|
||||||
|
en = ""
|
||||||
|
de = ""
|
||||||
|
ex = ""
|
||||||
|
for ci, cell in row_cells.items():
|
||||||
|
ct = col_type_map.get(ci, "")
|
||||||
|
text = (cell.get("text") or "").strip()
|
||||||
|
if not text:
|
||||||
|
continue
|
||||||
|
if "en" in ct:
|
||||||
|
en = text
|
||||||
|
elif "de" in ct:
|
||||||
|
de = text
|
||||||
|
elif "example" in ct or "text" in ct:
|
||||||
|
ex = text if not ex else ex + " " + text
|
||||||
|
|
||||||
|
if en or de:
|
||||||
|
page_vocabulary.append({
|
||||||
|
"id": str(uuid.uuid4()),
|
||||||
|
"english": en,
|
||||||
|
"german": de,
|
||||||
|
"example_sentence": ex,
|
||||||
|
"source_page": page_number + 1,
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
# Fallback: use basic cells → vocab entries
|
||||||
|
entries = _cells_to_vocab_entries(cells, columns_meta)
|
||||||
|
entries = _fix_phonetic_brackets(entries, pronunciation="british")
|
||||||
|
for entry in entries:
|
||||||
|
if not entry.get("english") and not entry.get("german"):
|
||||||
|
continue
|
||||||
|
page_vocabulary.append({
|
||||||
|
"id": str(uuid.uuid4()),
|
||||||
|
"english": entry.get("english", ""),
|
||||||
|
"german": entry.get("german", ""),
|
||||||
|
"example_sentence": entry.get("example", ""),
|
||||||
|
"source_page": page_number + 1,
|
||||||
|
})
|
||||||
|
|
||||||
total_duration = _time.time() - t_total
|
total_duration = _time.time() - t_total
|
||||||
logger.info(f"OCR Pipeline page {page_number + 1}: "
|
logger.info(f"Kombi Pipeline page {page_number + 1}: "
|
||||||
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
|
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
|
||||||
|
|
||||||
return page_vocabulary, rotation
|
return page_vocabulary, rotation
|
||||||
|
|||||||
@@ -156,6 +156,8 @@ export default function VocabWorksheetPage() {
|
|||||||
const [includeSolutions, setIncludeSolutions] = useState(true)
|
const [includeSolutions, setIncludeSolutions] = useState(true)
|
||||||
const [lineHeight, setLineHeight] = useState('normal')
|
const [lineHeight, setLineHeight] = useState('normal')
|
||||||
const [selectedFormat, setSelectedFormat] = useState<WorksheetFormat>('standard')
|
const [selectedFormat, setSelectedFormat] = useState<WorksheetFormat>('standard')
|
||||||
|
const [showIpa, setShowIpa] = useState(false)
|
||||||
|
const [showSyllables, setShowSyllables] = useState(false)
|
||||||
|
|
||||||
// Export state
|
// Export state
|
||||||
const [worksheetId, setWorksheetId] = useState<string | null>(null)
|
const [worksheetId, setWorksheetId] = useState<string | null>(null)
|
||||||
@@ -431,7 +433,9 @@ export default function VocabWorksheetPage() {
|
|||||||
const API_BASE = getApiBase()
|
const API_BASE = getApiBase()
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}`, {
|
const ipaParam = showIpa ? 'auto' : 'none'
|
||||||
|
const syllableParam = showSyllables ? 'auto' : 'none'
|
||||||
|
const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session!.id}/process-single-page/${pageIndex}?ipa_mode=${ipaParam}&syllable_mode=${syllableParam}`, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: { 'Content-Type': 'application/json' },
|
headers: { 'Content-Type': 'application/json' },
|
||||||
body: JSON.stringify({ ocr_prompts: ocrPrompts }),
|
body: JSON.stringify({ ocr_prompts: ocrPrompts }),
|
||||||
@@ -1907,6 +1911,27 @@ export default function VocabWorksheetPage() {
|
|||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{/* OCR display options */}
|
||||||
|
<div className={`p-4 rounded-xl border ${isDark ? 'bg-white/5 border-white/10' : 'bg-gray-50 border-gray-200'} space-y-3`}>
|
||||||
|
<h4 className={`text-sm font-medium ${isDark ? 'text-white/70' : 'text-slate-600'}`}>Anzeigeoptionen</h4>
|
||||||
|
<div className="flex flex-col gap-2">
|
||||||
|
<label className={`flex items-center gap-3 cursor-pointer ${isDark ? 'text-white' : 'text-slate-900'}`}>
|
||||||
|
<input type="checkbox" checked={showIpa} onChange={(e) => setShowIpa(e.target.checked)} className="w-5 h-5 rounded border-2 border-purple-500 text-purple-500 focus:ring-purple-500" />
|
||||||
|
<div>
|
||||||
|
<span>Lautschrift (IPA) anzeigen</span>
|
||||||
|
<p className={`text-xs ${isDark ? 'text-white/40' : 'text-slate-400'}`}>z.B. achieve [əˈtʃiːv]</p>
|
||||||
|
</div>
|
||||||
|
</label>
|
||||||
|
<label className={`flex items-center gap-3 cursor-pointer ${isDark ? 'text-white' : 'text-slate-900'}`}>
|
||||||
|
<input type="checkbox" checked={showSyllables} onChange={(e) => setShowSyllables(e.target.checked)} className="w-5 h-5 rounded border-2 border-purple-500 text-purple-500 focus:ring-purple-500" />
|
||||||
|
<div>
|
||||||
|
<span>Silbentrennung anzeigen</span>
|
||||||
|
<p className={`text-xs ${isDark ? 'text-white/40' : 'text-slate-400'}`}>z.B. Schmet|ter|ling</p>
|
||||||
|
</div>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<button
|
<button
|
||||||
onClick={generateWorksheet}
|
onClick={generateWorksheet}
|
||||||
disabled={(selectedFormat === 'standard' && selectedTypes.length === 0) || isGenerating}
|
disabled={(selectedFormat === 'standard' && selectedTypes.length === 0) || isGenerating}
|
||||||
|
|||||||
Reference in New Issue
Block a user