Add scan quality scoring, column limit, image enhancement (Steps 1-3)

Step 1: scan_quality.py — Laplacian blur + contrast scoring, adjusts OCR confidence threshold (40 for good scans, 30 for degraded). Quality report included in API response + shown in frontend. Step 2: max_columns parameter in cv_words_first.py — limits column detection to 3 for vocab tables, preventing phantom columns D/E from degraded OCR fragments. Step 3: ocr_image_enhance.py — CLAHE contrast + bilateral filter denoising + unsharp mask, only for degraded scans (gated by quality score). Pattern from handwriting_htr_api.py. Frontend: quality info shown in extraction status after processing. Reprocess button now derives pages from vocabulary data. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-23 14:58:39 +02:00
parent 5a154b744d
commit 2f34ee9ede
5 changed files with 267 additions and 16 deletions
--- a/klausur-service/backend/scan_quality.py
+++ b/klausur-service/backend/scan_quality.py
@@ -0,0 +1,102 @@
+"""
+Scan Quality Assessment — Measures image quality before OCR.
+
+Computes blur score, contrast score, and an overall quality rating.
+Used to gate enhancement steps and warn users about degraded scans.
+
+All operations use OpenCV (Apache-2.0), no additional dependencies.
+"""
+
+import logging
+from dataclasses import dataclass, asdict
+from typing import Dict, Any
+
+import cv2
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+# Thresholds (empirically tuned on textbook scans)
+BLUR_THRESHOLD = 100.0       # Laplacian variance strictly below this = blurry
+CONTRAST_THRESHOLD = 40.0    # Grayscale stddev strictly below this = low contrast
+CONFIDENCE_GOOD = 40         # Recommended OCR min confidence when no issue is flagged
+CONFIDENCE_DEGRADED = 30     # Recommended OCR min confidence when blurry or low contrast
+
+
+@dataclass
+class ScanQualityReport:
+    """Scan quality metrics plus derived flags, as built by score_scan_quality()."""
+    blur_score: float         # Laplacian variance (higher = sharper)
+    contrast_score: float     # Grayscale std deviation (higher = more contrast)
+    brightness: float         # Mean grayscale value (0-255)
+    is_blurry: bool           # True when the blur score is below BLUR_THRESHOLD
+    is_low_contrast: bool     # True when the contrast score is below CONTRAST_THRESHOLD
+    is_degraded: bool         # True if any quality issue detected (blurry or low contrast)
+    quality_pct: int          # 0-100 overall quality estimate
+    recommended_min_conf: int # Recommended OCR confidence threshold
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)  # {field name: value} for every field above
+
+
+def score_scan_quality(img_bgr: np.ndarray) -> ScanQualityReport:
+    """
+    Assess the quality of a scanned image before OCR.
+
+    Blur is the variance of the Laplacian, contrast the standard deviation
+    of the grayscale pixels, brightness their mean. A scan counts as degraded
+    when it is blurry or low-contrast. Blur and contrast each contribute at
+    most 50 points to quality_pct (50 = exactly at its threshold), so only a
+    scan that reaches both thresholds can score 100.
+
+    Args:
+        img_bgr: BGR image (numpy array from OpenCV); an already-grayscale
+            2-D array is accepted and used as-is.
+
+    Returns:
+        ScanQualityReport with scores (rounded to 1 decimal), flags and the
+        recommended OCR confidence threshold.
+    """
+    # Single-channel input has no colour axis to convert; cvtColor would reject it.
+    gray = img_bgr if img_bgr.ndim == 2 else cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+
+    # Blur detection: Laplacian variance (higher = sharper edges).
+    # CV_64F keeps negative responses instead of clipping them to 0.
+    blur_score = float(cv2.Laplacian(gray, cv2.CV_64F).var())
+
+    # Contrast / brightness: stddev and mean of the grayscale pixels.
+    contrast_score = float(np.std(gray))
+    brightness = float(np.mean(gray))
+
+    # Quality flags (computed on the unrounded scores)
+    is_blurry = blur_score < BLUR_THRESHOLD
+    is_low_contrast = contrast_score < CONTRAST_THRESHOLD
+    is_degraded = is_blurry or is_low_contrast
+
+    # Overall quality: each metric is scaled so its threshold maps to 50 and
+    # is capped there, so one very strong metric cannot mask a weak one.
+    blur_pct = min(50.0, blur_score / BLUR_THRESHOLD * 50)
+    contrast_pct = min(50.0, contrast_score / CONTRAST_THRESHOLD * 50)
+    quality_pct = int(blur_pct + contrast_pct)
+
+    # Degraded scans get the lower OCR confidence cut-off.
+    recommended_min_conf = CONFIDENCE_DEGRADED if is_degraded else CONFIDENCE_GOOD
+
+    report = ScanQualityReport(
+        blur_score=round(blur_score, 1),
+        contrast_score=round(contrast_score, 1),
+        brightness=round(brightness, 1),
+        is_blurry=is_blurry,
+        is_low_contrast=is_low_contrast,
+        is_degraded=is_degraded,
+        quality_pct=quality_pct,
+        recommended_min_conf=recommended_min_conf,
+    )
+
+    logger.info(
+        "Scan quality: blur=%s contrast=%s quality=%s%% degraded=%s min_conf=%s",
+        report.blur_score, report.contrast_score, report.quality_pct,
+        report.is_degraded, report.recommended_min_conf,
+    )
+
+    return report