Add scan quality scoring, column limit, image enhancement (Steps 1-3)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 28s
CI / test-nodejs-website (push) Successful in 20s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 28s
CI / test-nodejs-website (push) Successful in 20s
Step 1: scan_quality.py — Laplacian blur + contrast scoring, adjusts OCR confidence threshold (40 for good scans, 30 for degraded). Quality report included in API response + shown in frontend. Step 2: max_columns parameter in cv_words_first.py — limits column detection to 3 for vocab tables, preventing phantom columns D/E from degraded OCR fragments. Step 3: ocr_image_enhance.py — CLAHE contrast + bilateral filter denoising + unsharp mask, only for degraded scans (gated by quality score). Pattern from handwriting_htr_api.py. Frontend: quality info shown in extraction status after processing. Reprocess button now derives pages from vocabulary data. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -35,9 +35,15 @@ def _cluster_columns(
|
||||
words: List[Dict],
|
||||
img_w: int,
|
||||
min_gap_pct: float = 3.0,
|
||||
max_columns: Optional[int] = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Cluster words into columns by finding large horizontal gaps.
|
||||
|
||||
Args:
|
||||
max_columns: If set, limits the number of columns by merging
|
||||
the closest adjacent pairs until the count matches.
|
||||
Prevents phantom columns from degraded OCR.
|
||||
|
||||
Returns a list of column dicts:
|
||||
[{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
|
||||
sorted left-to-right.
|
||||
@@ -57,17 +63,28 @@ def _cluster_columns(
|
||||
|
||||
# Find X-gap boundaries between consecutive words (sorted by X-center)
|
||||
# For each word, compute right edge; for next word, compute left edge
|
||||
boundaries: List[float] = [] # X positions where columns split
|
||||
# Collect gaps with their sizes for max_columns enforcement
|
||||
gaps: List[Tuple[float, float]] = [] # (gap_size, split_x)
|
||||
for i in range(len(sorted_w) - 1):
|
||||
right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
|
||||
left_edge = sorted_w[i + 1]['left']
|
||||
gap = left_edge - right_edge
|
||||
if gap > min_gap_px:
|
||||
# Split point is midway through the gap
|
||||
boundaries.append((right_edge + left_edge) / 2)
|
||||
split_x = (right_edge + left_edge) / 2
|
||||
gaps.append((gap, split_x))
|
||||
|
||||
# If max_columns is set, keep only the (max_columns - 1) largest gaps
|
||||
if max_columns and len(gaps) >= max_columns:
|
||||
gaps.sort(key=lambda g: g[0], reverse=True)
|
||||
gaps = gaps[:max_columns - 1]
|
||||
logger.info(
|
||||
f"_cluster_columns: limited to {max_columns} columns "
|
||||
f"(removed {len(gaps) + max_columns - 1 - (max_columns - 1)} smallest gaps)"
|
||||
)
|
||||
|
||||
boundaries = sorted(g[1] for g in gaps)
|
||||
|
||||
# Build column ranges from boundaries
|
||||
# Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf)
|
||||
col_edges = [0.0] + boundaries + [float(img_w)]
|
||||
columns = []
|
||||
for ci in range(len(col_edges) - 1):
|
||||
@@ -302,6 +319,7 @@ def build_grid_from_words(
|
||||
img_h: int,
|
||||
min_confidence: int = 30,
|
||||
box_rects: Optional[List[Dict]] = None,
|
||||
max_columns: Optional[int] = None,
|
||||
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
|
||||
"""Build a cell grid bottom-up from Tesseract word boxes.
|
||||
|
||||
@@ -359,8 +377,9 @@ def build_grid_from_words(
|
||||
return [], []
|
||||
|
||||
# Step 1: cluster columns
|
||||
columns = _cluster_columns(words, img_w)
|
||||
logger.info("build_grid_from_words: %d column(s) detected", len(columns))
|
||||
columns = _cluster_columns(words, img_w, max_columns=max_columns)
|
||||
logger.info("build_grid_from_words: %d column(s) detected%s",
|
||||
len(columns), f" (max={max_columns})" if max_columns else "")
|
||||
|
||||
# Step 2: cluster rows
|
||||
rows = _cluster_rows(words)
|
||||
|
||||
92
klausur-service/backend/ocr_image_enhance.py
Normal file
92
klausur-service/backend/ocr_image_enhance.py
Normal file
@@ -0,0 +1,92 @@
|
||||
"""
|
||||
OCR Image Enhancement — Improve scan quality before OCR.
|
||||
|
||||
Applies CLAHE contrast enhancement + bilateral filter denoising
|
||||
to degraded scans. Only runs when scan_quality.is_degraded is True.
|
||||
|
||||
Pattern adapted from handwriting_htr_api.py (lines 50-68) and
|
||||
cv_layout.py (lines 229-241).
|
||||
|
||||
All operations use OpenCV (Apache-2.0).
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _clahe_lab(img_bgr: np.ndarray, clip_limit: float, tile_size: int) -> np.ndarray:
    """Apply CLAHE to the L channel of *img_bgr* in LAB space.

    Working on the L (lightness) channel only preserves color information,
    which the downstream RapidOCR engine uses.
    """
    lab = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2LAB)
    l_channel, a_channel, b_channel = cv2.split(lab)
    clahe = cv2.createCLAHE(
        clipLimit=clip_limit,
        tileGridSize=(tile_size, tile_size),
    )
    l_enhanced = clahe.apply(l_channel)
    lab_enhanced = cv2.merge([l_enhanced, a_channel, b_channel])
    return cv2.cvtColor(lab_enhanced, cv2.COLOR_LAB2BGR)


def enhance_for_ocr(
    img_bgr: np.ndarray,
    is_degraded: bool = False,
    clip_limit: float = 3.0,
    tile_size: int = 8,
    denoise_d: int = 9,
    denoise_sigma_color: float = 75,
    denoise_sigma_space: float = 75,
    sharpen: bool = True,
) -> np.ndarray:
    """
    Enhance image quality for OCR processing.

    Only applies aggressive enhancement when is_degraded is True.
    For good scans, applies minimal enhancement (light CLAHE only,
    fixed clip limit 2.0 — the clip_limit/tile_size parameters apply
    to the degraded path).

    Args:
        img_bgr: Input BGR image
        is_degraded: Whether the scan is degraded (from ScanQualityReport)
        clip_limit: CLAHE clip limit (higher = more contrast)
        tile_size: CLAHE tile grid size
        denoise_d: Bilateral filter diameter
        denoise_sigma_color: Bilateral filter sigma for color
        denoise_sigma_space: Bilateral filter sigma for space
        sharpen: Apply unsharp mask for blurry scans

    Returns:
        Enhanced BGR image
    """
    if not is_degraded:
        # For good scans: light CLAHE only (preserves quality)
        result = _clahe_lab(img_bgr, clip_limit=2.0, tile_size=8)
        logger.info("enhance_for_ocr: light CLAHE applied (good scan)")
        return result

    # Degraded scan: full enhancement pipeline
    logger.info(
        "enhance_for_ocr: full enhancement "
        "(CLAHE clip=%s, denoise d=%s, sharpen=%s)",
        clip_limit, denoise_d, sharpen,
    )

    # 1. CLAHE on L-channel of LAB colorspace (preserves color for RapidOCR)
    enhanced = _clahe_lab(img_bgr, clip_limit=clip_limit, tile_size=tile_size)

    # 2. Bilateral filter: denoises while preserving edges
    enhanced = cv2.bilateralFilter(
        enhanced,
        d=denoise_d,
        sigmaColor=denoise_sigma_color,
        sigmaSpace=denoise_sigma_space,
    )

    # 3. Unsharp mask for sharpening blurry text: subtract a Gaussian-blurred
    # copy (weights 1.5 / -0.5) to amplify edges
    if sharpen:
        gaussian = cv2.GaussianBlur(enhanced, (0, 0), 3)
        enhanced = cv2.addWeighted(enhanced, 1.5, gaussian, -0.5, 0)

    logger.info("enhance_for_ocr: full enhancement pipeline complete")
    return enhanced
|
||||
102
klausur-service/backend/scan_quality.py
Normal file
102
klausur-service/backend/scan_quality.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""
|
||||
Scan Quality Assessment — Measures image quality before OCR.
|
||||
|
||||
Computes blur score, contrast score, and an overall quality rating.
|
||||
Used to gate enhancement steps and warn users about degraded scans.
|
||||
|
||||
All operations use OpenCV (Apache-2.0), no additional dependencies.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, asdict
|
||||
from typing import Dict, Any
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Thresholds (empirically tuned on textbook scans)
|
||||
BLUR_THRESHOLD = 100.0 # Laplacian variance below this = blurry
|
||||
CONTRAST_THRESHOLD = 40.0 # Grayscale stddev below this = low contrast
|
||||
CONFIDENCE_GOOD = 40 # OCR min confidence for good scans
|
||||
CONFIDENCE_DEGRADED = 30 # OCR min confidence for degraded scans
|
||||
|
||||
|
||||
@dataclass
class ScanQualityReport:
    """Quality metrics for a single scanned page.

    Produced by score_scan_quality(); consumed to gate image enhancement
    and to pick the OCR confidence cutoff.
    """
    blur_score: float            # variance of the Laplacian; larger = sharper
    contrast_score: float        # std dev of the grayscale image; larger = more contrast
    brightness: float            # mean grayscale intensity in [0, 255]
    is_blurry: bool              # blur_score fell below BLUR_THRESHOLD
    is_low_contrast: bool        # contrast_score fell below CONTRAST_THRESHOLD
    is_degraded: bool            # set when blurry and/or low-contrast
    quality_pct: int             # combined 0-100 quality estimate
    recommended_min_conf: int    # suggested OCR min-confidence cutoff

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the report (JSON-serializable)."""
        return {key: value for key, value in asdict(self).items()}
|
||||
|
||||
|
||||
def score_scan_quality(img_bgr: np.ndarray) -> ScanQualityReport:
    """
    Assess the quality of a scanned image.

    Uses:
    - Laplacian variance for blur detection
    - Grayscale standard deviation for contrast
    - Mean brightness for exposure assessment

    Args:
        img_bgr: BGR image (numpy array from OpenCV)

    Returns:
        ScanQualityReport with scores and recommendations
    """
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # Blur detection: Laplacian variance
    # Higher = sharper edges = better quality
    blur_score = float(cv2.Laplacian(gray, cv2.CV_64F).var())

    # Contrast: standard deviation of grayscale
    contrast_score = float(np.std(gray))

    # Brightness: mean grayscale (currently informational only)
    brightness = float(np.mean(gray))

    # Quality flags
    is_blurry = blur_score < BLUR_THRESHOLD
    is_low_contrast = contrast_score < CONTRAST_THRESHOLD
    is_degraded = is_blurry or is_low_contrast

    # Overall quality percentage: each metric contributes at most 50 points.
    # BUGFIX: the per-metric cap was previously 100, so a single strong
    # metric (e.g. very sharp but near-zero contrast) could report
    # quality_pct=100 while is_degraded was simultaneously True.
    blur_pct = min(50.0, blur_score / BLUR_THRESHOLD * 50)
    contrast_pct = min(50.0, contrast_score / CONTRAST_THRESHOLD * 50)
    quality_pct = int(min(100, blur_pct + contrast_pct))

    # Recommended confidence threshold: degraded scans get a lower cutoff
    # so fewer Tesseract words are discarded
    recommended_min_conf = CONFIDENCE_DEGRADED if is_degraded else CONFIDENCE_GOOD

    report = ScanQualityReport(
        blur_score=round(blur_score, 1),
        contrast_score=round(contrast_score, 1),
        brightness=round(brightness, 1),
        is_blurry=is_blurry,
        is_low_contrast=is_low_contrast,
        is_degraded=is_degraded,
        quality_pct=quality_pct,
        recommended_min_conf=recommended_min_conf,
    )

    logger.info(
        "Scan quality: blur=%s contrast=%s quality=%s%% degraded=%s min_conf=%s",
        report.blur_score,
        report.contrast_score,
        report.quality_pct,
        report.is_degraded,
        report.recommended_min_conf,
    )

    return report
|
||||
@@ -1325,10 +1325,11 @@ async def process_single_page(
|
||||
|
||||
# --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
|
||||
rotation_deg = 0
|
||||
quality_report = None
|
||||
if OCR_PIPELINE_AVAILABLE:
|
||||
try:
|
||||
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
|
||||
page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
|
||||
page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
|
||||
img_bgr, page_number, session_id,
|
||||
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
|
||||
)
|
||||
@@ -1383,7 +1384,7 @@ async def process_single_page(
|
||||
session["vocabulary_count"] = len(existing_vocab)
|
||||
session["status"] = SessionStatus.EXTRACTED.value
|
||||
|
||||
return {
|
||||
result = {
|
||||
"session_id": session_id,
|
||||
"page_number": page_number + 1,
|
||||
"success": True,
|
||||
@@ -1394,6 +1395,14 @@ async def process_single_page(
|
||||
"rotation": rotation_deg,
|
||||
}
|
||||
|
||||
# Add scan quality report if available
|
||||
if quality_report:
|
||||
result["scan_quality"] = quality_report.to_dict()
|
||||
else:
|
||||
quality_report = None # ensure variable exists for non-pipeline path
|
||||
|
||||
return result
|
||||
|
||||
|
||||
async def _run_ocr_pipeline_for_page(
|
||||
img_bgr: np.ndarray,
|
||||
@@ -1471,6 +1480,26 @@ async def _run_ocr_pipeline_for_page(
|
||||
except Exception as e:
|
||||
logger.warning(f" crop: failed ({e}), continuing with uncropped image")
|
||||
|
||||
# 5b. Scan quality assessment
|
||||
scan_quality_report = None
|
||||
try:
|
||||
from scan_quality import score_scan_quality
|
||||
scan_quality_report = score_scan_quality(dewarped_bgr)
|
||||
except Exception as e:
|
||||
logger.warning(f" scan quality: failed ({e})")
|
||||
|
||||
min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40
|
||||
|
||||
# 5c. Image enhancement for degraded scans
|
||||
is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
|
||||
if is_degraded:
|
||||
try:
|
||||
from ocr_image_enhance import enhance_for_ocr
|
||||
dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
|
||||
logger.info(" enhancement: applied (degraded scan)")
|
||||
except Exception as e:
|
||||
logger.warning(f" enhancement: failed ({e})")
|
||||
|
||||
# 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
|
||||
t0 = _time.time()
|
||||
img_h, img_w = dewarped_bgr.shape[:2]
|
||||
@@ -1498,7 +1527,7 @@ async def _run_ocr_pipeline_for_page(
|
||||
text = str(data["text"][i]).strip()
|
||||
conf_raw = str(data["conf"][i])
|
||||
conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
|
||||
if not text or conf < 20:
|
||||
if not text or conf < min_ocr_conf:
|
||||
continue
|
||||
tess_words.append({
|
||||
"text": text,
|
||||
@@ -1518,8 +1547,8 @@ async def _run_ocr_pipeline_for_page(
|
||||
else:
|
||||
merged_words = tess_words # fallback to Tesseract only
|
||||
|
||||
# Build initial grid from merged words
|
||||
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
|
||||
# Build initial grid from merged words (limit to 3 columns for vocab tables)
|
||||
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=3)
|
||||
for cell in cells:
|
||||
cell["ocr_engine"] = "rapid_kombi"
|
||||
|
||||
@@ -1743,7 +1772,7 @@ async def _run_ocr_pipeline_for_page(
|
||||
logger.info(f"Kombi Pipeline page {page_number + 1}: "
|
||||
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
|
||||
|
||||
return page_vocabulary, rotation
|
||||
return page_vocabulary, rotation, scan_quality_report
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/process-pages")
|
||||
|
||||
@@ -355,7 +355,7 @@ export function useVocabWorksheet(): VocabWorksheetHook {
|
||||
}
|
||||
}
|
||||
|
||||
const processSinglePage = async (pageIndex: number, ipa: IpaMode, syllable: SyllableMode): Promise<{ success: boolean; vocabulary: VocabularyEntry[]; error?: string }> => {
|
||||
const processSinglePage = async (pageIndex: number, ipa: IpaMode, syllable: SyllableMode): Promise<{ success: boolean; vocabulary: VocabularyEntry[]; error?: string; scanQuality?: any }> => {
|
||||
const API_BASE = getApiBase()
|
||||
|
||||
try {
|
||||
@@ -377,7 +377,7 @@ export function useVocabWorksheet(): VocabWorksheetHook {
|
||||
return { success: false, vocabulary: [], error: data.error || `Seite ${pageIndex + 1}: Unbekannter Fehler` }
|
||||
}
|
||||
|
||||
return { success: true, vocabulary: data.vocabulary || [] }
|
||||
return { success: true, vocabulary: data.vocabulary || [], scanQuality: data.scan_quality }
|
||||
} catch (e) {
|
||||
return { success: false, vocabulary: [], error: `Seite ${pageIndex + 1}: ${e instanceof Error ? e.message : 'Netzwerkfehler'}` }
|
||||
}
|
||||
@@ -413,7 +413,10 @@ export function useVocabWorksheet(): VocabWorksheetHook {
|
||||
successful.push(pageIndex + 1)
|
||||
setSuccessfulPages([...successful])
|
||||
setVocabulary(prev => [...prev, ...result.vocabulary])
|
||||
setExtractionStatus(`Seite ${pageIndex + 1} fertig: ${result.vocabulary.length} Vokabeln gefunden`)
|
||||
const qualityInfo = result.scanQuality
|
||||
? ` | Qualitaet: ${result.scanQuality.quality_pct}%${result.scanQuality.is_degraded ? ' (degradiert!)' : ''}`
|
||||
: ''
|
||||
setExtractionStatus(`Seite ${pageIndex + 1} fertig: ${result.vocabulary.length} Vokabeln gefunden${qualityInfo}`)
|
||||
} else {
|
||||
failed.push(pageIndex + 1)
|
||||
setFailedPages([...failed])
|
||||
@@ -786,7 +789,9 @@ export function useVocabWorksheet(): VocabWorksheetHook {
|
||||
|
||||
;(async () => {
|
||||
const allVocab: VocabularyEntry[] = []
|
||||
let lastQuality: any = null
|
||||
for (const pageIndex of pagesToReprocess) {
|
||||
setExtractionStatus(`Verarbeite Seite ${pageIndex + 1}...`)
|
||||
try {
|
||||
const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session.id}/process-single-page/${pageIndex}?ipa_mode=${ipa}&syllable_mode=${syllable}`, {
|
||||
method: 'POST',
|
||||
@@ -796,12 +801,16 @@ export function useVocabWorksheet(): VocabWorksheetHook {
|
||||
if (res.ok) {
|
||||
const data = await res.json()
|
||||
if (data.vocabulary) allVocab.push(...data.vocabulary)
|
||||
if (data.scan_quality) lastQuality = data.scan_quality
|
||||
}
|
||||
} catch {}
|
||||
}
|
||||
setVocabulary(allVocab)
|
||||
setIsExtracting(false)
|
||||
setExtractionStatus(`${allVocab.length} Vokabeln mit neuen Einstellungen`)
|
||||
const qualityInfo = lastQuality
|
||||
? ` | Qualitaet: ${lastQuality.quality_pct}%${lastQuality.is_degraded ? ' (degradiert!)' : ''} | Blur: ${lastQuality.blur_score} | Kontrast: ${lastQuality.contrast_score}`
|
||||
: ''
|
||||
setExtractionStatus(`${allVocab.length} Vokabeln mit neuen Einstellungen${qualityInfo}`)
|
||||
})()
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user