Add scan quality scoring, column limit, image enhancement (Steps 1-3)

Step 1: scan_quality.py — Laplacian blur and contrast scoring that adjusts the OCR confidence threshold (40 for good scans, 30 for degraded scans). The quality report is included in the API response and shown in the frontend. Step 2: max_columns parameter in cv_words_first.py — limits column detection to 3 for vocab tables, preventing phantom columns D/E caused by degraded OCR fragments. Step 3: ocr_image_enhance.py — CLAHE contrast enhancement, bilateral-filter denoising, and an unsharp mask, applied only to degraded scans (gated by the quality score); the pattern is taken from handwriting_htr_api.py. Frontend: quality info is shown in the extraction status after processing, and the Reprocess button now derives its pages from the vocabulary data. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-23 14:58:39 +02:00
parent 5a154b744d
commit 2f34ee9ede
5 changed files with 267 additions and 16 deletions
--- a/klausur-service/backend/vocab_worksheet_api.py
+++ b/klausur-service/backend/vocab_worksheet_api.py
@@ -1325,10 +1325,11 @@ async def process_single_page(

    # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
    rotation_deg = 0
+    quality_report = None
    if OCR_PIPELINE_AVAILABLE:
        try:
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
-            page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
+            page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
                img_bgr, page_number, session_id,
                ipa_mode=ipa_mode, syllable_mode=syllable_mode,
            )
@@ -1383,7 +1384,7 @@ async def process_single_page(
    session["vocabulary_count"] = len(existing_vocab)
    session["status"] = SessionStatus.EXTRACTED.value

-    return {
+    result = {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
@@ -1394,6 +1395,14 @@ async def process_single_page(
        "rotation": rotation_deg,
    }

+    # Add scan quality report if available
+    if quality_report:
+        result["scan_quality"] = quality_report.to_dict()
+    else:
+        quality_report = None  # ensure variable exists for non-pipeline path
+
+    return result
+

 async def _run_ocr_pipeline_for_page(
    img_bgr: np.ndarray,
@@ -1471,6 +1480,26 @@ async def _run_ocr_pipeline_for_page(
    except Exception as e:
        logger.warning(f"  crop: failed ({e}), continuing with uncropped image")

+    # 5b. Scan quality assessment
+    scan_quality_report = None
+    try:
+        from scan_quality import score_scan_quality
+        scan_quality_report = score_scan_quality(dewarped_bgr)
+    except Exception as e:
+        logger.warning(f"  scan quality: failed ({e})")
+
+    min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40
+
+    # 5c. Image enhancement for degraded scans
+    is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
+    if is_degraded:
+        try:
+            from ocr_image_enhance import enhance_for_ocr
+            dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
+            logger.info("  enhancement: applied (degraded scan)")
+        except Exception as e:
+            logger.warning(f"  enhancement: failed ({e})")
+
    # 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
    t0 = _time.time()
    img_h, img_w = dewarped_bgr.shape[:2]
@@ -1498,7 +1527,7 @@ async def _run_ocr_pipeline_for_page(
        text = str(data["text"][i]).strip()
        conf_raw = str(data["conf"][i])
        conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
-        if not text or conf < 20:
+        if not text or conf < min_ocr_conf:
            continue
        tess_words.append({
            "text": text,
@@ -1518,8 +1547,8 @@ async def _run_ocr_pipeline_for_page(
    else:
        merged_words = tess_words  # fallback to Tesseract only

-    # Build initial grid from merged words
-    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
+    # Build initial grid from merged words (limit to 3 columns for vocab tables)
+    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=3)
    for cell in cells:
        cell["ocr_engine"] = "rapid_kombi"

@@ -1743,7 +1772,7 @@ async def _run_ocr_pipeline_for_page(
    logger.info(f"Kombi Pipeline page {page_number + 1}: "
                f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")

-    return page_vocabulary, rotation
+    return page_vocabulary, rotation, scan_quality_report


@router.post("/sessions/{session_id}/process-pages")