Add scan quality scoring, column limit, image enhancement (Steps 1-3)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 28s
CI / test-nodejs-website (push) Successful in 20s

Step 1: scan_quality.py — Laplacian blur + contrast scoring, adjusts
OCR confidence threshold (40 for good scans, 30 for degraded).
Quality report included in API response + shown in frontend.

Step 2: max_columns parameter in cv_words_first.py — limits column
detection to 3 for vocab tables, preventing phantom columns D/E
from degraded OCR fragments.

Step 3: ocr_image_enhance.py — CLAHE contrast + bilateral filter
denoising + unsharp mask, only for degraded scans (gated by
quality score). Pattern from handwriting_htr_api.py.

Frontend: quality info shown in extraction status after processing.
Reprocess button now derives pages from vocabulary data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-23 14:58:39 +02:00
parent 5a154b744d
commit 2f34ee9ede
5 changed files with 267 additions and 16 deletions

View File

@@ -35,9 +35,15 @@ def _cluster_columns(
words: List[Dict],
img_w: int,
min_gap_pct: float = 3.0,
max_columns: Optional[int] = None,
) -> List[Dict[str, Any]]:
"""Cluster words into columns by finding large horizontal gaps.
Args:
max_columns: If set, limits the number of columns by merging
the closest adjacent pairs until the count matches.
Prevents phantom columns from degraded OCR.
Returns a list of column dicts:
[{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
sorted left-to-right.
@@ -57,17 +63,28 @@ def _cluster_columns(
# Find X-gap boundaries between consecutive words (sorted by X-center)
# For each word, compute right edge; for next word, compute left edge
boundaries: List[float] = [] # X positions where columns split
# Collect gaps with their sizes for max_columns enforcement
gaps: List[Tuple[float, float]] = [] # (gap_size, split_x)
for i in range(len(sorted_w) - 1):
right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
left_edge = sorted_w[i + 1]['left']
gap = left_edge - right_edge
if gap > min_gap_px:
# Split point is midway through the gap
boundaries.append((right_edge + left_edge) / 2)
split_x = (right_edge + left_edge) / 2
gaps.append((gap, split_x))
# If max_columns is set, keep only the (max_columns - 1) largest gaps
if max_columns and len(gaps) >= max_columns:
gaps.sort(key=lambda g: g[0], reverse=True)
gaps = gaps[:max_columns - 1]
logger.info(
f"_cluster_columns: limited to {max_columns} columns "
f"(removed {len(gaps) + max_columns - 1 - (max_columns - 1)} smallest gaps)"
)
boundaries = sorted(g[1] for g in gaps)
# Build column ranges from boundaries
# Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf)
col_edges = [0.0] + boundaries + [float(img_w)]
columns = []
for ci in range(len(col_edges) - 1):
@@ -302,6 +319,7 @@ def build_grid_from_words(
img_h: int,
min_confidence: int = 30,
box_rects: Optional[List[Dict]] = None,
max_columns: Optional[int] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
"""Build a cell grid bottom-up from Tesseract word boxes.
@@ -359,8 +377,9 @@ def build_grid_from_words(
return [], []
# Step 1: cluster columns
columns = _cluster_columns(words, img_w)
logger.info("build_grid_from_words: %d column(s) detected", len(columns))
columns = _cluster_columns(words, img_w, max_columns=max_columns)
logger.info("build_grid_from_words: %d column(s) detected%s",
len(columns), f" (max={max_columns})" if max_columns else "")
# Step 2: cluster rows
rows = _cluster_rows(words)

View File

@@ -0,0 +1,92 @@
"""
OCR Image Enhancement — Improve scan quality before OCR.
Applies CLAHE contrast enhancement + bilateral filter denoising
to degraded scans. Only runs when scan_quality.is_degraded is True.
Pattern adapted from handwriting_htr_api.py (lines 50-68) and
cv_layout.py (lines 229-241).
All operations use OpenCV (Apache-2.0).
"""
import logging
import cv2
import numpy as np
logger = logging.getLogger(__name__)
def enhance_for_ocr(
    img_bgr: np.ndarray,
    is_degraded: bool = False,
    clip_limit: float = 3.0,
    tile_size: int = 8,
    denoise_d: int = 9,
    denoise_sigma_color: float = 75,
    denoise_sigma_space: float = 75,
    sharpen: bool = True,
) -> np.ndarray:
    """
    Enhance image quality for OCR processing.

    Only applies the aggressive pipeline (CLAHE + bilateral denoising +
    optional unsharp mask) when ``is_degraded`` is True. Good scans get a
    light CLAHE pass only, so their quality is not degraded further.

    Args:
        img_bgr: Input BGR image.
        is_degraded: Whether the scan is degraded (from ScanQualityReport).
        clip_limit: CLAHE clip limit (higher = more contrast).
        tile_size: CLAHE tile grid size.
        denoise_d: Bilateral filter diameter.
        denoise_sigma_color: Bilateral filter sigma for color.
        denoise_sigma_space: Bilateral filter sigma for space.
        sharpen: Apply unsharp mask for blurry scans.

    Returns:
        Enhanced BGR image (a new array; the input is not modified).
    """
    if not is_degraded:
        # Good scans: light CLAHE only (preserves existing quality).
        result = _clahe_on_lab(img_bgr, clip_limit=2.0, tile_size=8)
        logger.info("enhance_for_ocr: light CLAHE applied (good scan)")
        return result

    # Degraded scan: full enhancement pipeline.
    # Lazy %-style args avoid formatting the message when INFO is disabled.
    logger.info(
        "enhance_for_ocr: full enhancement (CLAHE clip=%s, denoise d=%s, sharpen=%s)",
        clip_limit, denoise_d, sharpen,
    )

    # 1. CLAHE on the L-channel of LAB colorspace (preserves color for RapidOCR).
    enhanced = _clahe_on_lab(img_bgr, clip_limit=clip_limit, tile_size=tile_size)

    # 2. Bilateral filter: denoises while preserving text edges.
    enhanced = cv2.bilateralFilter(
        enhanced,
        d=denoise_d,
        sigmaColor=denoise_sigma_color,
        sigmaSpace=denoise_sigma_space,
    )

    # 3. Unsharp mask: blend in a negative Gaussian-blurred copy to sharpen
    #    blurry glyph edges (1.5*img - 0.5*blur).
    if sharpen:
        gaussian = cv2.GaussianBlur(enhanced, (0, 0), 3)
        enhanced = cv2.addWeighted(enhanced, 1.5, gaussian, -0.5, 0)

    logger.info("enhance_for_ocr: full enhancement pipeline complete")
    return enhanced


def _clahe_on_lab(img_bgr: np.ndarray, clip_limit: float, tile_size: int) -> np.ndarray:
    """Apply CLAHE to the L-channel in LAB colorspace, return a BGR image.

    Equalizing only the luminance channel boosts local contrast without
    shifting colors — shared by the good-scan and degraded-scan paths.
    """
    lab = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2LAB)
    l_channel, a_channel, b_channel = cv2.split(lab)
    clahe = cv2.createCLAHE(
        clipLimit=clip_limit,
        tileGridSize=(tile_size, tile_size),
    )
    l_enhanced = clahe.apply(l_channel)
    lab_enhanced = cv2.merge([l_enhanced, a_channel, b_channel])
    return cv2.cvtColor(lab_enhanced, cv2.COLOR_LAB2BGR)

View File

@@ -0,0 +1,102 @@
"""
Scan Quality Assessment — Measures image quality before OCR.
Computes blur score, contrast score, and an overall quality rating.
Used to gate enhancement steps and warn users about degraded scans.
All operations use OpenCV (Apache-2.0), no additional dependencies.
"""
import logging
from dataclasses import dataclass, asdict
from typing import Dict, Any
import cv2
import numpy as np
logger = logging.getLogger(__name__)
# Thresholds (empirically tuned on textbook scans)
BLUR_THRESHOLD = 100.0 # Laplacian variance below this = blurry
CONTRAST_THRESHOLD = 40.0 # Grayscale stddev below this = low contrast
CONFIDENCE_GOOD = 40 # OCR min confidence for good scans
CONFIDENCE_DEGRADED = 30 # OCR min confidence for degraded scans
@dataclass
class ScanQualityReport:
    """Result of scan quality assessment.

    Produced by score_scan_quality(); used to gate image enhancement and
    to choose the OCR confidence threshold for a page.
    """
    blur_score: float  # Laplacian variance of grayscale (higher = sharper)
    contrast_score: float  # Grayscale std deviation (higher = more contrast)
    brightness: float  # Mean grayscale value (0-255)
    is_blurry: bool  # blur_score fell below BLUR_THRESHOLD
    is_low_contrast: bool  # contrast_score fell below CONTRAST_THRESHOLD
    is_degraded: bool  # True if any quality issue detected (blurry OR low contrast)
    quality_pct: int  # 0-100 overall quality estimate
    recommended_min_conf: int  # Recommended OCR confidence threshold

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of all fields (e.g. for JSON API responses)."""
        return asdict(self)
def score_scan_quality(img_bgr: np.ndarray) -> ScanQualityReport:
    """
    Assess the quality of a scanned image.

    Uses:
    - Laplacian variance for blur detection
    - Grayscale standard deviation for contrast
    - Mean brightness for exposure assessment

    Args:
        img_bgr: BGR image (numpy array from OpenCV)

    Returns:
        ScanQualityReport with scores and recommendations
    """
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # Blur detection: variance of the Laplacian.
    # Higher = sharper edges = better quality.
    laplacian = cv2.Laplacian(gray, cv2.CV_64F)
    blur_score = float(laplacian.var())

    # Contrast: standard deviation of grayscale values.
    contrast_score = float(np.std(gray))

    # Brightness: mean grayscale (reported for exposure diagnostics only;
    # it does not feed into the degraded/quality decision).
    brightness = float(np.mean(gray))

    # Quality flags.
    is_blurry = blur_score < BLUR_THRESHOLD
    is_low_contrast = contrast_score < CONTRAST_THRESHOLD
    is_degraded = is_blurry or is_low_contrast

    # Overall quality percentage: each metric contributes at most 50 points,
    # reaching its full share at its threshold. Capping each component at 50
    # (not 100) prevents one very strong metric from masking the other —
    # e.g. a razor-sharp but contrast-free scan must not report 100% quality
    # while simultaneously being flagged as degraded.
    blur_pct = min(50, blur_score / BLUR_THRESHOLD * 50)
    contrast_pct = min(50, contrast_score / CONTRAST_THRESHOLD * 50)
    quality_pct = int(min(100, blur_pct + contrast_pct))

    # Recommended OCR confidence threshold: lowered for degraded scans so
    # fewer low-confidence words are discarded downstream.
    recommended_min_conf = CONFIDENCE_DEGRADED if is_degraded else CONFIDENCE_GOOD

    report = ScanQualityReport(
        blur_score=round(blur_score, 1),
        contrast_score=round(contrast_score, 1),
        brightness=round(brightness, 1),
        is_blurry=is_blurry,
        is_low_contrast=is_low_contrast,
        is_degraded=is_degraded,
        quality_pct=quality_pct,
        recommended_min_conf=recommended_min_conf,
    )
    # Lazy %-style args: same rendered message as before, but only formatted
    # when the INFO level is actually enabled.
    logger.info(
        "Scan quality: blur=%s contrast=%s quality=%s%% degraded=%s min_conf=%s",
        report.blur_score, report.contrast_score, report.quality_pct,
        report.is_degraded, report.recommended_min_conf,
    )
    return report

View File

@@ -1325,10 +1325,11 @@ async def process_single_page(
# --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
rotation_deg = 0
quality_report = None
if OCR_PIPELINE_AVAILABLE:
try:
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
img_bgr, page_number, session_id,
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
)
@@ -1383,7 +1384,7 @@ async def process_single_page(
session["vocabulary_count"] = len(existing_vocab)
session["status"] = SessionStatus.EXTRACTED.value
return {
result = {
"session_id": session_id,
"page_number": page_number + 1,
"success": True,
@@ -1394,6 +1395,14 @@ async def process_single_page(
"rotation": rotation_deg,
}
# Add scan quality report if available
if quality_report:
result["scan_quality"] = quality_report.to_dict()
else:
quality_report = None # ensure variable exists for non-pipeline path
return result
async def _run_ocr_pipeline_for_page(
img_bgr: np.ndarray,
@@ -1471,6 +1480,26 @@ async def _run_ocr_pipeline_for_page(
except Exception as e:
logger.warning(f" crop: failed ({e}), continuing with uncropped image")
# 5b. Scan quality assessment
scan_quality_report = None
try:
from scan_quality import score_scan_quality
scan_quality_report = score_scan_quality(dewarped_bgr)
except Exception as e:
logger.warning(f" scan quality: failed ({e})")
min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40
# 5c. Image enhancement for degraded scans
is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
if is_degraded:
try:
from ocr_image_enhance import enhance_for_ocr
dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
logger.info(" enhancement: applied (degraded scan)")
except Exception as e:
logger.warning(f" enhancement: failed ({e})")
# 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
t0 = _time.time()
img_h, img_w = dewarped_bgr.shape[:2]
@@ -1498,7 +1527,7 @@ async def _run_ocr_pipeline_for_page(
text = str(data["text"][i]).strip()
conf_raw = str(data["conf"][i])
conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
if not text or conf < 20:
if not text or conf < min_ocr_conf:
continue
tess_words.append({
"text": text,
@@ -1518,8 +1547,8 @@ async def _run_ocr_pipeline_for_page(
else:
merged_words = tess_words # fallback to Tesseract only
# Build initial grid from merged words
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
# Build initial grid from merged words (limit to 3 columns for vocab tables)
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=3)
for cell in cells:
cell["ocr_engine"] = "rapid_kombi"
@@ -1743,7 +1772,7 @@ async def _run_ocr_pipeline_for_page(
logger.info(f"Kombi Pipeline page {page_number + 1}: "
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
return page_vocabulary, rotation
return page_vocabulary, rotation, scan_quality_report
@router.post("/sessions/{session_id}/process-pages")

View File

@@ -355,7 +355,7 @@ export function useVocabWorksheet(): VocabWorksheetHook {
}
}
const processSinglePage = async (pageIndex: number, ipa: IpaMode, syllable: SyllableMode): Promise<{ success: boolean; vocabulary: VocabularyEntry[]; error?: string }> => {
const processSinglePage = async (pageIndex: number, ipa: IpaMode, syllable: SyllableMode): Promise<{ success: boolean; vocabulary: VocabularyEntry[]; error?: string; scanQuality?: any }> => {
const API_BASE = getApiBase()
try {
@@ -377,7 +377,7 @@ export function useVocabWorksheet(): VocabWorksheetHook {
return { success: false, vocabulary: [], error: data.error || `Seite ${pageIndex + 1}: Unbekannter Fehler` }
}
return { success: true, vocabulary: data.vocabulary || [] }
return { success: true, vocabulary: data.vocabulary || [], scanQuality: data.scan_quality }
} catch (e) {
return { success: false, vocabulary: [], error: `Seite ${pageIndex + 1}: ${e instanceof Error ? e.message : 'Netzwerkfehler'}` }
}
@@ -413,7 +413,10 @@ export function useVocabWorksheet(): VocabWorksheetHook {
successful.push(pageIndex + 1)
setSuccessfulPages([...successful])
setVocabulary(prev => [...prev, ...result.vocabulary])
setExtractionStatus(`Seite ${pageIndex + 1} fertig: ${result.vocabulary.length} Vokabeln gefunden`)
const qualityInfo = result.scanQuality
? ` | Qualitaet: ${result.scanQuality.quality_pct}%${result.scanQuality.is_degraded ? ' (degradiert!)' : ''}`
: ''
setExtractionStatus(`Seite ${pageIndex + 1} fertig: ${result.vocabulary.length} Vokabeln gefunden${qualityInfo}`)
} else {
failed.push(pageIndex + 1)
setFailedPages([...failed])
@@ -786,7 +789,9 @@ export function useVocabWorksheet(): VocabWorksheetHook {
;(async () => {
const allVocab: VocabularyEntry[] = []
let lastQuality: any = null
for (const pageIndex of pagesToReprocess) {
setExtractionStatus(`Verarbeite Seite ${pageIndex + 1}...`)
try {
const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session.id}/process-single-page/${pageIndex}?ipa_mode=${ipa}&syllable_mode=${syllable}`, {
method: 'POST',
@@ -796,12 +801,16 @@ export function useVocabWorksheet(): VocabWorksheetHook {
if (res.ok) {
const data = await res.json()
if (data.vocabulary) allVocab.push(...data.vocabulary)
if (data.scan_quality) lastQuality = data.scan_quality
}
} catch {}
}
setVocabulary(allVocab)
setIsExtracting(false)
setExtractionStatus(`${allVocab.length} Vokabeln mit neuen Einstellungen`)
const qualityInfo = lastQuality
? ` | Qualitaet: ${lastQuality.quality_pct}%${lastQuality.is_degraded ? ' (degradiert!)' : ''} | Blur: ${lastQuality.blur_score} | Kontrast: ${lastQuality.contrast_score}`
: ''
setExtractionStatus(`${allVocab.length} Vokabeln mit neuen Einstellungen${qualityInfo}`)
})()
}