""" Scan Quality Assessment — Measures image quality before OCR. Computes blur score, contrast score, and an overall quality rating. Used to gate enhancement steps and warn users about degraded scans. All operations use OpenCV (Apache-2.0), no additional dependencies. """ import logging from dataclasses import dataclass, asdict from typing import Dict, Any import cv2 import numpy as np logger = logging.getLogger(__name__) # Thresholds (empirically tuned on textbook scans) BLUR_THRESHOLD = 100.0 # Laplacian variance below this = blurry CONTRAST_THRESHOLD = 40.0 # Grayscale stddev below this = low contrast CONFIDENCE_GOOD = 40 # OCR min confidence for good scans CONFIDENCE_DEGRADED = 30 # OCR min confidence for degraded scans @dataclass class ScanQualityReport: """Result of scan quality assessment.""" blur_score: float # Laplacian variance (higher = sharper) contrast_score: float # Grayscale std deviation (higher = more contrast) brightness: float # Mean grayscale value (0-255) is_blurry: bool is_low_contrast: bool is_degraded: bool # True if any quality issue detected quality_pct: int # 0-100 overall quality estimate recommended_min_conf: int # Recommended OCR confidence threshold def to_dict(self) -> Dict[str, Any]: return asdict(self) def score_scan_quality(img_bgr: np.ndarray) -> ScanQualityReport: """ Assess the quality of a scanned image. Uses: - Laplacian variance for blur detection - Grayscale standard deviation for contrast - Mean brightness for exposure assessment Args: img_bgr: BGR image (numpy array from OpenCV) Returns: ScanQualityReport with scores and recommendations """ gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) # Blur detection: Laplacian variance # Higher = sharper edges = better quality laplacian = cv2.Laplacian(gray, cv2.CV_64F) blur_score = float(laplacian.var()) # Contrast: standard deviation of grayscale contrast_score = float(np.std(gray)) # Brightness: mean grayscale brightness = float(np.mean(gray)) # Quality flags is_blurry = blur_score < BLUR_THRESHOLD is_low_contrast = contrast_score < CONTRAST_THRESHOLD is_degraded = is_blurry or is_low_contrast # Overall quality percentage (simple weighted combination) blur_pct = min(100, blur_score / BLUR_THRESHOLD * 50) contrast_pct = min(100, contrast_score / CONTRAST_THRESHOLD * 50) quality_pct = int(min(100, blur_pct + contrast_pct)) # Recommended confidence threshold recommended_min_conf = CONFIDENCE_DEGRADED if is_degraded else CONFIDENCE_GOOD report = ScanQualityReport( blur_score=round(blur_score, 1), contrast_score=round(contrast_score, 1), brightness=round(brightness, 1), is_blurry=is_blurry, is_low_contrast=is_low_contrast, is_degraded=is_degraded, quality_pct=quality_pct, recommended_min_conf=recommended_min_conf, ) logger.info( f"Scan quality: blur={report.blur_score} " f"contrast={report.contrast_score} " f"quality={report.quality_pct}% " f"degraded={report.is_degraded} " f"min_conf={report.recommended_min_conf}" ) return report