Add scan quality scoring, column limit, image enhancement (Steps 1-3)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 28s
CI / test-nodejs-website (push) Successful in 20s

Step 1: scan_quality.py — scores each scan using Laplacian blur and
contrast measurements, and uses the result to adjust the OCR confidence
threshold (40 for good scans, 30 for degraded ones).
The quality report is included in the API response and shown in the frontend.

Step 2: max_columns parameter in cv_words_first.py — limits column
detection to 3 for vocab tables, preventing phantom columns D/E
from degraded OCR fragments.

Step 3: ocr_image_enhance.py — CLAHE contrast + bilateral filter
denoising + unsharp mask, only for degraded scans (gated by
quality score). Pattern from handwriting_htr_api.py.

Frontend: quality info is shown in the extraction status after processing.
The Reprocess button now derives the pages to reprocess from the vocabulary data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-23 14:58:39 +02:00
parent 5a154b744d
commit 2f34ee9ede
5 changed files with 267 additions and 16 deletions

View File

@@ -1325,10 +1325,11 @@ async def process_single_page(
# --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
rotation_deg = 0
quality_report = None
if OCR_PIPELINE_AVAILABLE:
try:
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
img_bgr, page_number, session_id,
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
)
@@ -1383,7 +1384,7 @@ async def process_single_page(
session["vocabulary_count"] = len(existing_vocab)
session["status"] = SessionStatus.EXTRACTED.value
return {
result = {
"session_id": session_id,
"page_number": page_number + 1,
"success": True,
@@ -1394,6 +1395,14 @@ async def process_single_page(
"rotation": rotation_deg,
}
# Add scan quality report if available
if quality_report:
result["scan_quality"] = quality_report.to_dict()
else:
quality_report = None # ensure variable exists for non-pipeline path
return result
async def _run_ocr_pipeline_for_page(
img_bgr: np.ndarray,
@@ -1471,6 +1480,26 @@ async def _run_ocr_pipeline_for_page(
except Exception as e:
logger.warning(f" crop: failed ({e}), continuing with uncropped image")
# 5b. Scan quality assessment
scan_quality_report = None
try:
from scan_quality import score_scan_quality
scan_quality_report = score_scan_quality(dewarped_bgr)
except Exception as e:
logger.warning(f" scan quality: failed ({e})")
min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40
# 5c. Image enhancement for degraded scans
is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
if is_degraded:
try:
from ocr_image_enhance import enhance_for_ocr
dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
logger.info(" enhancement: applied (degraded scan)")
except Exception as e:
logger.warning(f" enhancement: failed ({e})")
# 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
t0 = _time.time()
img_h, img_w = dewarped_bgr.shape[:2]
@@ -1498,7 +1527,7 @@ async def _run_ocr_pipeline_for_page(
text = str(data["text"][i]).strip()
conf_raw = str(data["conf"][i])
conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
if not text or conf < 20:
if not text or conf < min_ocr_conf:
continue
tess_words.append({
"text": text,
@@ -1518,8 +1547,8 @@ async def _run_ocr_pipeline_for_page(
else:
merged_words = tess_words # fallback to Tesseract only
# Build initial grid from merged words
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
# Build initial grid from merged words (limit to 3 columns for vocab tables)
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=3)
for cell in cells:
cell["ocr_engine"] = "rapid_kombi"
@@ -1743,7 +1772,7 @@ async def _run_ocr_pipeline_for_page(
logger.info(f"Kombi Pipeline page {page_number + 1}: "
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
return page_vocabulary, rotation
return page_vocabulary, rotation, scan_quality_report
@router.post("/sessions/{session_id}/process-pages")