Add scan quality scoring, column limit, image enhancement (Steps 1-3)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 28s
CI / test-nodejs-website (push) Successful in 20s
(duplicate CI status block removed — identical to the listing above)
Step 1: scan_quality.py — Laplacian blur + contrast scoring, adjusts OCR confidence threshold (40 for good scans, 30 for degraded). Quality report included in API response + shown in frontend. Step 2: max_columns parameter in cv_words_first.py — limits column detection to 3 for vocab tables, preventing phantom columns D/E from degraded OCR fragments. Step 3: ocr_image_enhance.py — CLAHE contrast + bilateral filter denoising + unsharp mask, only for degraded scans (gated by quality score). Pattern from handwriting_htr_api.py. Frontend: quality info shown in extraction status after processing. Reprocess button now derives pages from vocabulary data. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -355,7 +355,7 @@ export function useVocabWorksheet(): VocabWorksheetHook {
|
||||
}
|
||||
}
|
||||
|
||||
const processSinglePage = async (pageIndex: number, ipa: IpaMode, syllable: SyllableMode): Promise<{ success: boolean; vocabulary: VocabularyEntry[]; error?: string }> => {
|
||||
const processSinglePage = async (pageIndex: number, ipa: IpaMode, syllable: SyllableMode): Promise<{ success: boolean; vocabulary: VocabularyEntry[]; error?: string; scanQuality?: any }> => {
|
||||
const API_BASE = getApiBase()
|
||||
|
||||
try {
|
||||
@@ -377,7 +377,7 @@ export function useVocabWorksheet(): VocabWorksheetHook {
|
||||
return { success: false, vocabulary: [], error: data.error || `Seite ${pageIndex + 1}: Unbekannter Fehler` }
|
||||
}
|
||||
|
||||
return { success: true, vocabulary: data.vocabulary || [] }
|
||||
return { success: true, vocabulary: data.vocabulary || [], scanQuality: data.scan_quality }
|
||||
} catch (e) {
|
||||
return { success: false, vocabulary: [], error: `Seite ${pageIndex + 1}: ${e instanceof Error ? e.message : 'Netzwerkfehler'}` }
|
||||
}
|
||||
@@ -413,7 +413,10 @@ export function useVocabWorksheet(): VocabWorksheetHook {
|
||||
successful.push(pageIndex + 1)
|
||||
setSuccessfulPages([...successful])
|
||||
setVocabulary(prev => [...prev, ...result.vocabulary])
|
||||
setExtractionStatus(`Seite ${pageIndex + 1} fertig: ${result.vocabulary.length} Vokabeln gefunden`)
|
||||
const qualityInfo = result.scanQuality
|
||||
? ` | Qualitaet: ${result.scanQuality.quality_pct}%${result.scanQuality.is_degraded ? ' (degradiert!)' : ''}`
|
||||
: ''
|
||||
setExtractionStatus(`Seite ${pageIndex + 1} fertig: ${result.vocabulary.length} Vokabeln gefunden${qualityInfo}`)
|
||||
} else {
|
||||
failed.push(pageIndex + 1)
|
||||
setFailedPages([...failed])
|
||||
@@ -786,7 +789,9 @@ export function useVocabWorksheet(): VocabWorksheetHook {
|
||||
|
||||
;(async () => {
|
||||
const allVocab: VocabularyEntry[] = []
|
||||
let lastQuality: any = null
|
||||
for (const pageIndex of pagesToReprocess) {
|
||||
setExtractionStatus(`Verarbeite Seite ${pageIndex + 1}...`)
|
||||
try {
|
||||
const res = await fetch(`${API_BASE}/api/v1/vocab/sessions/${session.id}/process-single-page/${pageIndex}?ipa_mode=${ipa}&syllable_mode=${syllable}`, {
|
||||
method: 'POST',
|
||||
@@ -796,12 +801,16 @@ export function useVocabWorksheet(): VocabWorksheetHook {
|
||||
if (res.ok) {
|
||||
const data = await res.json()
|
||||
if (data.vocabulary) allVocab.push(...data.vocabulary)
|
||||
if (data.scan_quality) lastQuality = data.scan_quality
|
||||
}
|
||||
} catch {}
|
||||
}
|
||||
setVocabulary(allVocab)
|
||||
setIsExtracting(false)
|
||||
setExtractionStatus(`${allVocab.length} Vokabeln mit neuen Einstellungen`)
|
||||
const qualityInfo = lastQuality
|
||||
? ` | Qualitaet: ${lastQuality.quality_pct}%${lastQuality.is_degraded ? ' (degradiert!)' : ''} | Blur: ${lastQuality.blur_score} | Kontrast: ${lastQuality.contrast_score}`
|
||||
: ''
|
||||
setExtractionStatus(`${allVocab.length} Vokabeln mit neuen Einstellungen${qualityInfo}`)
|
||||
})()
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user