Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 28s
CI / test-nodejs-website (push) Successful in 20s
Step 1: scan_quality.py — Laplacian blur + contrast scoring, adjusts OCR confidence threshold (40 for good scans, 30 for degraded). Quality report included in API response + shown in frontend. Step 2: max_columns parameter in cv_words_first.py — limits column detection to 3 for vocab tables, preventing phantom columns D/E from degraded OCR fragments. Step 3: ocr_image_enhance.py — CLAHE contrast + bilateral filter denoising + unsharp mask, only for degraded scans (gated by quality score). Pattern from handwriting_htr_api.py. Frontend: quality info shown in extraction status after processing. Reprocess button now derives pages from vocabulary data. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
103 lines
3.3 KiB
Python
103 lines
3.3 KiB
Python
"""
|
|
Scan Quality Assessment — Measures image quality before OCR.

Computes blur score, contrast score, and an overall quality rating.
Used to gate enhancement steps and warn users about degraded scans.

All operations use OpenCV (Apache-2.0) and NumPy; no additional dependencies.
|
|
"""
|
|
|
|
import logging
|
|
from dataclasses import dataclass, asdict
|
|
from typing import Dict, Any
|
|
|
|
import cv2
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)

# Thresholds (empirically tuned on textbook scans)

# Laplacian variance below this value marks a scan as blurry.
BLUR_THRESHOLD: float = 100.0

# Grayscale standard deviation below this value marks a scan as low contrast.
CONTRAST_THRESHOLD: float = 40.0

# Minimum OCR confidence recommended for good scans.
CONFIDENCE_GOOD: int = 40

# Minimum OCR confidence recommended for degraded scans.
CONFIDENCE_DEGRADED: int = 30
|
|
|
|
|
|
@dataclass
class ScanQualityReport:
    """Quality metrics and flags produced for one scanned image.

    Attributes:
        blur_score: Laplacian variance (higher = sharper).
        contrast_score: Grayscale standard deviation (higher = more contrast).
        brightness: Mean grayscale value (0-255).
        is_blurry: Whether the scan was flagged as blurry.
        is_low_contrast: Whether the scan was flagged as low contrast.
        is_degraded: True if any quality issue was detected.
        quality_pct: Overall quality estimate, 0-100.
        recommended_min_conf: Recommended OCR confidence threshold.
    """

    blur_score: float
    contrast_score: float
    brightness: float
    is_blurry: bool
    is_low_contrast: bool
    is_degraded: bool
    quality_pct: int
    recommended_min_conf: int

    def to_dict(self) -> Dict[str, Any]:
        """Return the report as a plain dictionary keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
|
|
|
|
|
|
def score_scan_quality(img_bgr: np.ndarray) -> ScanQualityReport:
    """
    Assess the quality of a scanned image.

    Uses:
    - Laplacian variance for blur detection
    - Grayscale standard deviation for contrast
    - Mean brightness for exposure assessment

    The overall ``quality_pct`` awards each of blur and contrast at most
    50 points (reached at the respective threshold), so it is 100 only when
    neither metric falls below its threshold and a degraded scan can never
    be reported as 100%.

    Args:
        img_bgr: BGR image (numpy array from OpenCV). An array that is
            already single-channel (H x W or H x W x 1) is used as-is.

    Returns:
        ScanQualityReport with scores and recommendations
    """
    # cv2.cvtColor(..., COLOR_BGR2GRAY) rejects single-channel input, so
    # skip the conversion when the image is already grayscale. getattr keeps
    # non-array input (e.g. None) flowing into cvtColor, which raises the
    # same error as before.
    ndim = getattr(img_bgr, "ndim", None)
    if ndim == 2:
        gray = img_bgr
    elif ndim == 3 and img_bgr.shape[2] == 1:
        gray = img_bgr.reshape(img_bgr.shape[:2])
    else:
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # Blur detection: Laplacian variance
    # Higher = sharper edges = better quality
    laplacian = cv2.Laplacian(gray, cv2.CV_64F)
    blur_score = float(laplacian.var())

    # Contrast: standard deviation of grayscale
    contrast_score = float(np.std(gray))

    # Brightness: mean grayscale
    brightness = float(np.mean(gray))

    # Quality flags (computed on the unrounded scores)
    is_blurry = blur_score < BLUR_THRESHOLD
    is_low_contrast = contrast_score < CONTRAST_THRESHOLD
    is_degraded = is_blurry or is_low_contrast

    # Overall quality percentage: each metric is worth up to 50 points.
    # Capping per component at 50 (not 100) prevents a strong value in one
    # metric from masking a weak value in the other.
    blur_pct = min(50.0, blur_score / BLUR_THRESHOLD * 50)
    contrast_pct = min(50.0, contrast_score / CONTRAST_THRESHOLD * 50)
    quality_pct = int(blur_pct + contrast_pct)

    # Recommended confidence threshold
    recommended_min_conf = CONFIDENCE_DEGRADED if is_degraded else CONFIDENCE_GOOD

    report = ScanQualityReport(
        blur_score=round(blur_score, 1),
        contrast_score=round(contrast_score, 1),
        brightness=round(brightness, 1),
        is_blurry=is_blurry,
        is_low_contrast=is_low_contrast,
        is_degraded=is_degraded,
        quality_pct=quality_pct,
        recommended_min_conf=recommended_min_conf,
    )

    # Lazy %-style arguments: the message is only formatted if INFO is enabled.
    logger.info(
        "Scan quality: blur=%s contrast=%s quality=%s%% degraded=%s min_conf=%s",
        report.blur_score,
        report.contrast_score,
        report.quality_pct,
        report.is_degraded,
        report.recommended_min_conf,
    )

    return report
|