Add scan quality scoring, column limit, image enhancement (Steps 1-3)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 28s
CI / test-nodejs-website (push) Successful in 20s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 28s
CI / test-nodejs-website (push) Successful in 20s
Step 1: scan_quality.py — Laplacian blur + contrast scoring; adjusts the OCR confidence threshold (40 for good scans, 30 for degraded). The quality report is included in the API response and shown in the frontend.

Step 2: max_columns parameter in cv_words_first.py — limits column detection to 3 for vocab tables, preventing phantom columns D/E from degraded OCR fragments.

Step 3: ocr_image_enhance.py — CLAHE contrast + bilateral filter denoising + unsharp mask, applied only to degraded scans (gated by the quality score). Pattern taken from handwriting_htr_api.py.

Frontend: quality info is shown in the extraction status after processing. The Reprocess button now derives pages from vocabulary data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1325,10 +1325,11 @@ async def process_single_page(
|
||||
|
||||
# --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
|
||||
rotation_deg = 0
|
||||
quality_report = None
|
||||
if OCR_PIPELINE_AVAILABLE:
|
||||
try:
|
||||
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
|
||||
page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
|
||||
page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
|
||||
img_bgr, page_number, session_id,
|
||||
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
|
||||
)
|
||||
@@ -1383,7 +1384,7 @@ async def process_single_page(
|
||||
session["vocabulary_count"] = len(existing_vocab)
|
||||
session["status"] = SessionStatus.EXTRACTED.value
|
||||
|
||||
return {
|
||||
result = {
|
||||
"session_id": session_id,
|
||||
"page_number": page_number + 1,
|
||||
"success": True,
|
||||
@@ -1394,6 +1395,14 @@ async def process_single_page(
|
||||
"rotation": rotation_deg,
|
||||
}
|
||||
|
||||
# Add scan quality report if available
|
||||
if quality_report:
|
||||
result["scan_quality"] = quality_report.to_dict()
|
||||
else:
|
||||
quality_report = None # ensure variable exists for non-pipeline path
|
||||
|
||||
return result
|
||||
|
||||
|
||||
async def _run_ocr_pipeline_for_page(
|
||||
img_bgr: np.ndarray,
|
||||
@@ -1471,6 +1480,26 @@ async def _run_ocr_pipeline_for_page(
|
||||
except Exception as e:
|
||||
logger.warning(f" crop: failed ({e}), continuing with uncropped image")
|
||||
|
||||
# 5b. Scan quality assessment
|
||||
scan_quality_report = None
|
||||
try:
|
||||
from scan_quality import score_scan_quality
|
||||
scan_quality_report = score_scan_quality(dewarped_bgr)
|
||||
except Exception as e:
|
||||
logger.warning(f" scan quality: failed ({e})")
|
||||
|
||||
min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40
|
||||
|
||||
# 5c. Image enhancement for degraded scans
|
||||
is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
|
||||
if is_degraded:
|
||||
try:
|
||||
from ocr_image_enhance import enhance_for_ocr
|
||||
dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
|
||||
logger.info(" enhancement: applied (degraded scan)")
|
||||
except Exception as e:
|
||||
logger.warning(f" enhancement: failed ({e})")
|
||||
|
||||
# 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
|
||||
t0 = _time.time()
|
||||
img_h, img_w = dewarped_bgr.shape[:2]
|
||||
@@ -1498,7 +1527,7 @@ async def _run_ocr_pipeline_for_page(
|
||||
text = str(data["text"][i]).strip()
|
||||
conf_raw = str(data["conf"][i])
|
||||
conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
|
||||
if not text or conf < 20:
|
||||
if not text or conf < min_ocr_conf:
|
||||
continue
|
||||
tess_words.append({
|
||||
"text": text,
|
||||
@@ -1518,8 +1547,8 @@ async def _run_ocr_pipeline_for_page(
|
||||
else:
|
||||
merged_words = tess_words # fallback to Tesseract only
|
||||
|
||||
# Build initial grid from merged words
|
||||
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
|
||||
# Build initial grid from merged words (limit to 3 columns for vocab tables)
|
||||
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=3)
|
||||
for cell in cells:
|
||||
cell["ocr_engine"] = "rapid_kombi"
|
||||
|
||||
@@ -1743,7 +1772,7 @@ async def _run_ocr_pipeline_for_page(
|
||||
logger.info(f"Kombi Pipeline page {page_number + 1}: "
|
||||
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
|
||||
|
||||
return page_vocabulary, rotation
|
||||
return page_vocabulary, rotation, scan_quality_report
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/process-pages")
|
||||
|
||||
Reference in New Issue
Block a user