Add scan quality scoring, column limit, image enhancement (Steps 1-3)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 28s
CI / test-nodejs-website (push) Successful in 20s

Step 1: scan_quality.py — Laplacian blur + contrast scoring; adjusts the
OCR confidence threshold (40 for good scans, 30 for degraded scans).
Quality report included in API response + shown in frontend.

Step 2: max_columns parameter in cv_words_first.py — caps column
detection at 3 columns for vocab tables, preventing phantom columns D/E
from being created out of degraded OCR fragments.

Step 3: ocr_image_enhance.py — CLAHE contrast + bilateral-filter
denoising + unsharp mask, applied only to degraded scans (gated by the
quality score). The pipeline follows the pattern established in
handwriting_htr_api.py.

Frontend: quality info shown in extraction status after processing.
Reprocess button now derives pages from vocabulary data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-23 14:58:39 +02:00
parent 5a154b744d
commit 2f34ee9ede
5 changed files with 267 additions and 16 deletions

View File

@@ -355,7 +355,7 @@ export function useVocabWorksheet(): VocabWorksheetHook {
}
}
const processSinglePage = async (pageIndex: number, ipa: IpaMode, syllable: SyllableMode): Promise<{ success: boolean; vocabulary: VocabularyEntry[]; error?: string }> => {
const processSinglePage = async (pageIndex: number, ipa: IpaMode, syllable: SyllableMode): Promise<{ success: boolean; vocabulary: VocabularyEntry[]; error?: string; scanQuality?: any }> => {
const API_BASE = getApiBase()
try {
@@ -377,7 +377,7 @@ export function useVocabWorksheet(): VocabWorksheetHook {
return { success: false, vocabulary: [], error: data.error || `Seite ${pageIndex + 1}: Unbekannter Fehler` }
}
return { success: true, vocabulary: data.vocabulary || [] }
return { success: true, vocabulary: data.vocabulary || [], scanQuality: data.scan_quality }
} catch (e) {
return { success: false, vocabulary: [], error: `Seite ${pageIndex + 1}: ${e instanceof Error ? e.message : 'Netzwerkfehler'}` }
}
@@ -413,7 +413,10 @@ export function useVocabWorksheet(): VocabWorksheetHook {
successful.push(pageIndex + 1)
setSuccessfulPages([...successful])
setVocabulary(prev => [...prev, ...result.vocabulary])
setExtractionStatus(`Seite ${pageIndex + 1} fertig: ${result.vocabulary.length} Vokabeln gefunden`)
const qualityInfo = result.scanQuality
? ` | Qualitaet: ${result.scanQuality.quality_pct}%${result.scanQuality.is_degraded ? ' (degradiert!)' : ''}`
: ''
setExtractionStatus(`Seite ${pageIndex + 1} fertig: ${result.vocabulary.length} Vokabeln gefunden${qualityInfo}`)
} else {
failed.push(pageIndex + 1)
setFailedPages([...failed])
@@ -786,7 +789,9 @@ export function useVocabWorksheet(): VocabWorksheetHook {
;(async () => {
const allVocab: VocabularyEntry[] = []
let lastQuality: any = null
for (const pageIndex of pagesToReprocess) {
setExtractionStatus(`Verarbeite Seite ${pageIndex + 1}...`)
try {
const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session.id}/process-single-page/${pageIndex}?ipa_mode=${ipa}&syllable_mode=${syllable}`, {
method: 'POST',
@@ -796,12 +801,16 @@ export function useVocabWorksheet(): VocabWorksheetHook {
if (res.ok) {
const data = await res.json()
if (data.vocabulary) allVocab.push(...data.vocabulary)
if (data.scan_quality) lastQuality = data.scan_quality
}
} catch {}
}
setVocabulary(allVocab)
setIsExtracting(false)
setExtractionStatus(`${allVocab.length} Vokabeln mit neuen Einstellungen`)
const qualityInfo = lastQuality
? ` | Qualitaet: ${lastQuality.quality_pct}%${lastQuality.is_degraded ? ' (degradiert!)' : ''} | Blur: ${lastQuality.blur_score} | Kontrast: ${lastQuality.contrast_score}`
: ''
setExtractionStatus(`${allVocab.length} Vokabeln mit neuen Einstellungen${qualityInfo}`)
})()
}