diff --git a/klausur-service/backend/cv_words_first.py b/klausur-service/backend/cv_words_first.py index 19f77cd..a4756e6 100644 --- a/klausur-service/backend/cv_words_first.py +++ b/klausur-service/backend/cv_words_first.py @@ -35,9 +35,15 @@ def _cluster_columns( words: List[Dict], img_w: int, min_gap_pct: float = 3.0, + max_columns: Optional[int] = None, ) -> List[Dict[str, Any]]: """Cluster words into columns by finding large horizontal gaps. + + Args: + max_columns: If set, limits the number of columns by merging + the closest adjacent pairs until the count matches. + Prevents phantom columns from degraded OCR. + Returns a list of column dicts: [{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...] sorted left-to-right. @@ -57,17 +63,28 @@ # Find X-gap boundaries between consecutive words (sorted by X-center) # For each word, compute right edge; for next word, compute left edge - boundaries: List[float] = [] # X positions where columns split + # Collect gaps with their sizes for max_columns enforcement + gaps: List[Tuple[float, float]] = [] # (gap_size, split_x) for i in range(len(sorted_w) - 1): right_edge = sorted_w[i]['left'] + sorted_w[i]['width'] left_edge = sorted_w[i + 1]['left'] gap = left_edge - right_edge if gap > min_gap_px: - # Split point is midway through the gap - boundaries.append((right_edge + left_edge) / 2) + split_x = (right_edge + left_edge) / 2 + gaps.append((gap, split_x)) + + # If max_columns is set, keep only the (max_columns - 1) largest gaps + if max_columns and len(gaps) >= max_columns: + gaps.sort(key=lambda g: g[0], reverse=True) + logger.info( + f"_cluster_columns: limited to {max_columns} columns " + f"(removed {len(gaps) - (max_columns - 1)} smallest gaps)" + ) + gaps = gaps[:max_columns - 1] + + boundaries = sorted(g[1] for g in gaps) # Build column ranges from boundaries - # Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf) col_edges =
[0.0] + boundaries + [float(img_w)] columns = [] for ci in range(len(col_edges) - 1): @@ -302,6 +319,7 @@ def build_grid_from_words( img_h: int, min_confidence: int = 30, box_rects: Optional[List[Dict]] = None, + max_columns: Optional[int] = None, ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: """Build a cell grid bottom-up from Tesseract word boxes. @@ -359,8 +377,9 @@ def build_grid_from_words( return [], [] # Step 1: cluster columns - columns = _cluster_columns(words, img_w) - logger.info("build_grid_from_words: %d column(s) detected", len(columns)) + columns = _cluster_columns(words, img_w, max_columns=max_columns) + logger.info("build_grid_from_words: %d column(s) detected%s", + len(columns), f" (max={max_columns})" if max_columns else "") # Step 2: cluster rows rows = _cluster_rows(words) diff --git a/klausur-service/backend/ocr_image_enhance.py b/klausur-service/backend/ocr_image_enhance.py new file mode 100644 index 0000000..6b23d94 --- /dev/null +++ b/klausur-service/backend/ocr_image_enhance.py @@ -0,0 +1,92 @@ +""" +OCR Image Enhancement — Improve scan quality before OCR. + +Applies CLAHE contrast enhancement + bilateral filter denoising +to degraded scans. Only runs when scan_quality.is_degraded is True. + +Pattern adapted from handwriting_htr_api.py (lines 50-68) and +cv_layout.py (lines 229-241). + +All operations use OpenCV (Apache-2.0). +""" + +import logging + +import cv2 +import numpy as np + +logger = logging.getLogger(__name__) + + +def enhance_for_ocr( + img_bgr: np.ndarray, + is_degraded: bool = False, + clip_limit: float = 3.0, + tile_size: int = 8, + denoise_d: int = 9, + denoise_sigma_color: float = 75, + denoise_sigma_space: float = 75, + sharpen: bool = True, +) -> np.ndarray: + """ + Enhance image quality for OCR processing. + + Only applies aggressive enhancement when is_degraded is True. + For good scans, applies minimal enhancement (light CLAHE only). 
+ + Args: + img_bgr: Input BGR image + is_degraded: Whether the scan is degraded (from ScanQualityReport) + clip_limit: CLAHE clip limit (higher = more contrast) + tile_size: CLAHE tile grid size + denoise_d: Bilateral filter diameter + denoise_sigma_color: Bilateral filter sigma for color + denoise_sigma_space: Bilateral filter sigma for space + sharpen: Apply unsharp mask for blurry scans + + Returns: + Enhanced BGR image + """ + if not is_degraded: + # For good scans: light CLAHE only (preserves quality) + lab = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2LAB) + l_channel, a_channel, b_channel = cv2.split(lab) + clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) + l_enhanced = clahe.apply(l_channel) + lab_enhanced = cv2.merge([l_enhanced, a_channel, b_channel]) + result = cv2.cvtColor(lab_enhanced, cv2.COLOR_LAB2BGR) + logger.info("enhance_for_ocr: light CLAHE applied (good scan)") + return result + + # Degraded scan: full enhancement pipeline + logger.info( + f"enhance_for_ocr: full enhancement " + f"(CLAHE clip={clip_limit}, denoise d={denoise_d}, sharpen={sharpen})" + ) + + # 1. CLAHE on L-channel of LAB colorspace (preserves color for RapidOCR) + lab = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2LAB) + l_channel, a_channel, b_channel = cv2.split(lab) + clahe = cv2.createCLAHE( + clipLimit=clip_limit, + tileGridSize=(tile_size, tile_size), + ) + l_enhanced = clahe.apply(l_channel) + lab_enhanced = cv2.merge([l_enhanced, a_channel, b_channel]) + enhanced = cv2.cvtColor(lab_enhanced, cv2.COLOR_LAB2BGR) + + # 2. Bilateral filter: denoises while preserving edges + enhanced = cv2.bilateralFilter( + enhanced, + d=denoise_d, + sigmaColor=denoise_sigma_color, + sigmaSpace=denoise_sigma_space, + ) + + # 3. 
Unsharp mask for sharpening blurry text + if sharpen: + gaussian = cv2.GaussianBlur(enhanced, (0, 0), 3) + enhanced = cv2.addWeighted(enhanced, 1.5, gaussian, -0.5, 0) + + logger.info("enhance_for_ocr: full enhancement pipeline complete") + return enhanced diff --git a/klausur-service/backend/scan_quality.py b/klausur-service/backend/scan_quality.py new file mode 100644 index 0000000..d869140 --- /dev/null +++ b/klausur-service/backend/scan_quality.py @@ -0,0 +1,102 @@ +""" +Scan Quality Assessment — Measures image quality before OCR. + +Computes blur score, contrast score, and an overall quality rating. +Used to gate enhancement steps and warn users about degraded scans. + +All operations use OpenCV (Apache-2.0), no additional dependencies. +""" + +import logging +from dataclasses import dataclass, asdict +from typing import Dict, Any + +import cv2 +import numpy as np + +logger = logging.getLogger(__name__) + +# Thresholds (empirically tuned on textbook scans) +BLUR_THRESHOLD = 100.0 # Laplacian variance below this = blurry +CONTRAST_THRESHOLD = 40.0 # Grayscale stddev below this = low contrast +CONFIDENCE_GOOD = 40 # OCR min confidence for good scans +CONFIDENCE_DEGRADED = 30 # OCR min confidence for degraded scans + + +@dataclass +class ScanQualityReport: + """Result of scan quality assessment.""" + blur_score: float # Laplacian variance (higher = sharper) + contrast_score: float # Grayscale std deviation (higher = more contrast) + brightness: float # Mean grayscale value (0-255) + is_blurry: bool + is_low_contrast: bool + is_degraded: bool # True if any quality issue detected + quality_pct: int # 0-100 overall quality estimate + recommended_min_conf: int # Recommended OCR confidence threshold + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + +def score_scan_quality(img_bgr: np.ndarray) -> ScanQualityReport: + """ + Assess the quality of a scanned image. 
+ + Uses: + - Laplacian variance for blur detection + - Grayscale standard deviation for contrast + - Mean brightness for exposure assessment + + Args: + img_bgr: BGR image (numpy array from OpenCV) + + Returns: + ScanQualityReport with scores and recommendations + """ + gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) + + # Blur detection: Laplacian variance + # Higher = sharper edges = better quality + laplacian = cv2.Laplacian(gray, cv2.CV_64F) + blur_score = float(laplacian.var()) + + # Contrast: standard deviation of grayscale + contrast_score = float(np.std(gray)) + + # Brightness: mean grayscale + brightness = float(np.mean(gray)) + + # Quality flags + is_blurry = blur_score < BLUR_THRESHOLD + is_low_contrast = contrast_score < CONTRAST_THRESHOLD + is_degraded = is_blurry or is_low_contrast + + # Overall quality percentage (each axis contributes at most 50) + blur_pct = min(50, blur_score / BLUR_THRESHOLD * 50) + contrast_pct = min(50, contrast_score / CONTRAST_THRESHOLD * 50) + quality_pct = int(min(100, blur_pct + contrast_pct)) + + # Recommended confidence threshold + recommended_min_conf = CONFIDENCE_DEGRADED if is_degraded else CONFIDENCE_GOOD + + report = ScanQualityReport( + blur_score=round(blur_score, 1), + contrast_score=round(contrast_score, 1), + brightness=round(brightness, 1), + is_blurry=is_blurry, + is_low_contrast=is_low_contrast, + is_degraded=is_degraded, + quality_pct=quality_pct, + recommended_min_conf=recommended_min_conf, + ) + + logger.info( + f"Scan quality: blur={report.blur_score} " + f"contrast={report.contrast_score} " + f"quality={report.quality_pct}% " + f"degraded={report.is_degraded} " + f"min_conf={report.recommended_min_conf}" + ) + + return report diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py index 91e7307..01a6c2f 100644 --- a/klausur-service/backend/vocab_worksheet_api.py +++ b/klausur-service/backend/vocab_worksheet_api.py @@ -1325,10 +1325,11 @@ async def
process_single_page( # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) --- rotation_deg = 0 + quality_report = None if OCR_PIPELINE_AVAILABLE: try: img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0) - page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page( + page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page( img_bgr, page_number, session_id, ipa_mode=ipa_mode, syllable_mode=syllable_mode, ) @@ -1383,7 +1384,7 @@ async def process_single_page( session["vocabulary_count"] = len(existing_vocab) session["status"] = SessionStatus.EXTRACTED.value - return { + result = { "session_id": session_id, "page_number": page_number + 1, "success": True, @@ -1394,6 +1395,14 @@ "rotation": rotation_deg, } + # Add scan quality report if available + if quality_report: + result["scan_quality"] = quality_report.to_dict() + # quality_report stays None when the OCR pipeline path was skipped, + # so the scan_quality key is simply omitted in that case + + return result + async def _run_ocr_pipeline_for_page( img_bgr: np.ndarray, @@ -1471,6 +1480,26 @@ except Exception as e: logger.warning(f" crop: failed ({e}), continuing with uncropped image") + # 5b. Scan quality assessment + scan_quality_report = None + try: + from scan_quality import score_scan_quality + scan_quality_report = score_scan_quality(dewarped_bgr) + except Exception as e: + logger.warning(f" scan quality: failed ({e})") + + min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40 + + # 5c. Image enhancement for degraded scans + is_degraded = scan_quality_report.is_degraded if scan_quality_report else False + if is_degraded: + try: + from ocr_image_enhance import enhance_for_ocr + dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True) + logger.info(" enhancement: applied (degraded scan)") + except Exception as e: + logger.warning(f" enhancement: failed ({e})") + # 6.
Dual-engine OCR (RapidOCR + Tesseract → merge) t0 = _time.time() img_h, img_w = dewarped_bgr.shape[:2] @@ -1498,7 +1527,7 @@ async def _run_ocr_pipeline_for_page( text = str(data["text"][i]).strip() conf_raw = str(data["conf"][i]) conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1 - if not text or conf < 20: + if not text or conf < min_ocr_conf: continue tess_words.append({ "text": text, @@ -1518,8 +1547,8 @@ async def _run_ocr_pipeline_for_page( else: merged_words = tess_words # fallback to Tesseract only - # Build initial grid from merged words - cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h) + # Build initial grid from merged words (limit to 3 columns for vocab tables) + cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=3) for cell in cells: cell["ocr_engine"] = "rapid_kombi" @@ -1743,7 +1772,7 @@ async def _run_ocr_pipeline_for_page( logger.info(f"Kombi Pipeline page {page_number + 1}: " f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s") - return page_vocabulary, rotation + return page_vocabulary, rotation, scan_quality_report @router.post("/sessions/{session_id}/process-pages") diff --git a/studio-v2/app/vocab-worksheet/useVocabWorksheet.ts b/studio-v2/app/vocab-worksheet/useVocabWorksheet.ts index 72a2ee3..a286d81 100644 --- a/studio-v2/app/vocab-worksheet/useVocabWorksheet.ts +++ b/studio-v2/app/vocab-worksheet/useVocabWorksheet.ts @@ -355,7 +355,7 @@ export function useVocabWorksheet(): VocabWorksheetHook { } } - const processSinglePage = async (pageIndex: number, ipa: IpaMode, syllable: SyllableMode): Promise<{ success: boolean; vocabulary: VocabularyEntry[]; error?: string }> => { + const processSinglePage = async (pageIndex: number, ipa: IpaMode, syllable: SyllableMode): Promise<{ success: boolean; vocabulary: VocabularyEntry[]; error?: string; scanQuality?: any }> => { const API_BASE = getApiBase() try { @@ -377,7 +377,7 @@ export function useVocabWorksheet(): 
VocabWorksheetHook { return { success: false, vocabulary: [], error: data.error || `Seite ${pageIndex + 1}: Unbekannter Fehler` } } - return { success: true, vocabulary: data.vocabulary || [] } + return { success: true, vocabulary: data.vocabulary || [], scanQuality: data.scan_quality } } catch (e) { return { success: false, vocabulary: [], error: `Seite ${pageIndex + 1}: ${e instanceof Error ? e.message : 'Netzwerkfehler'}` } } @@ -413,7 +413,10 @@ export function useVocabWorksheet(): VocabWorksheetHook { successful.push(pageIndex + 1) setSuccessfulPages([...successful]) setVocabulary(prev => [...prev, ...result.vocabulary]) - setExtractionStatus(`Seite ${pageIndex + 1} fertig: ${result.vocabulary.length} Vokabeln gefunden`) + const qualityInfo = result.scanQuality + ? ` | Qualitaet: ${result.scanQuality.quality_pct}%${result.scanQuality.is_degraded ? ' (degradiert!)' : ''}` + : '' + setExtractionStatus(`Seite ${pageIndex + 1} fertig: ${result.vocabulary.length} Vokabeln gefunden${qualityInfo}`) } else { failed.push(pageIndex + 1) setFailedPages([...failed]) @@ -786,7 +789,9 @@ export function useVocabWorksheet(): VocabWorksheetHook { ;(async () => { const allVocab: VocabularyEntry[] = [] + let lastQuality: any = null for (const pageIndex of pagesToReprocess) { + setExtractionStatus(`Verarbeite Seite ${pageIndex + 1}...`) try { const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session.id}/process-single-page/${pageIndex}?ipa_mode=${ipa}&syllable_mode=${syllable}`, { method: 'POST', @@ -796,12 +801,16 @@ export function useVocabWorksheet(): VocabWorksheetHook { if (res.ok) { const data = await res.json() if (data.vocabulary) allVocab.push(...data.vocabulary) + if (data.scan_quality) lastQuality = data.scan_quality } } catch {} } setVocabulary(allVocab) setIsExtracting(false) - setExtractionStatus(`${allVocab.length} Vokabeln mit neuen Einstellungen`) + const qualityInfo = lastQuality + ? 
` | Qualitaet: ${lastQuality.quality_pct}%${lastQuality.is_degraded ? ' (degradiert!)' : ''} | Blur: ${lastQuality.blur_score} | Kontrast: ${lastQuality.contrast_score}` + : '' + setExtractionStatus(`${allVocab.length} Vokabeln mit neuen Einstellungen${qualityInfo}`) })() }