Fix: reprocess button works after session resume + apply merge logic
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 46s
CI / test-python-klausur (push) Failing after 2m37s
CI / test-python-agent-core (push) Successful in 34s
CI / test-nodejs-website (push) Successful in 34s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 46s
CI / test-python-klausur (push) Failing after 2m37s
CI / test-python-agent-core (push) Successful in 34s
CI / test-nodejs-website (push) Successful in 34s
Two bugs fixed: 1. reprocessPages() returned early without doing anything after a session resume, because successfulPages was empty. It now uses successfulPages when available, otherwise derives the pages from the vocabulary entries' source_page, then falls back to selectedPages, and finally to page 0. 2. The process-single-page endpoint built vocabulary entries WITHOUT applying the row-merge logic. It now runs the full merge pipeline (_merge_wrapped_rows, _merge_phonetic_continuation_rows, _merge_continuation_rows) after vocabulary extraction. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -77,6 +77,11 @@ try:
|
|||||||
render_pdf_high_res,
|
render_pdf_high_res,
|
||||||
PageRegion, RowGeometry,
|
PageRegion, RowGeometry,
|
||||||
)
|
)
|
||||||
|
from cv_cell_grid import (
|
||||||
|
_merge_wrapped_rows,
|
||||||
|
_merge_phonetic_continuation_rows,
|
||||||
|
_merge_continuation_rows,
|
||||||
|
)
|
||||||
from ocr_pipeline_session_store import (
|
from ocr_pipeline_session_store import (
|
||||||
create_session_db as create_pipeline_session_db,
|
create_session_db as create_pipeline_session_db,
|
||||||
update_session_db as update_pipeline_session_db,
|
update_session_db as update_pipeline_session_db,
|
||||||
@@ -1696,6 +1701,42 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
})
|
})
|
||||||
extraction_source = f"generic ({len(all_ci)} cols)"
|
extraction_source = f"generic ({len(all_ci)} cols)"
|
||||||
|
|
||||||
|
# --- Post-processing: merge cell-wrap continuation rows ---
|
||||||
|
if len(page_vocabulary) >= 2:
|
||||||
|
try:
|
||||||
|
# Convert to internal format (example_sentence → example)
|
||||||
|
internal = []
|
||||||
|
for v in page_vocabulary:
|
||||||
|
internal.append({
|
||||||
|
'row_index': len(internal),
|
||||||
|
'english': v.get('english', ''),
|
||||||
|
'german': v.get('german', ''),
|
||||||
|
'example': v.get('example_sentence', ''),
|
||||||
|
})
|
||||||
|
|
||||||
|
n_before = len(internal)
|
||||||
|
internal = _merge_wrapped_rows(internal)
|
||||||
|
internal = _merge_phonetic_continuation_rows(internal)
|
||||||
|
internal = _merge_continuation_rows(internal)
|
||||||
|
|
||||||
|
if len(internal) < n_before:
|
||||||
|
# Rebuild page_vocabulary from merged entries
|
||||||
|
merged_vocab = []
|
||||||
|
for entry in internal:
|
||||||
|
if not entry.get('english') and not entry.get('german'):
|
||||||
|
continue
|
||||||
|
merged_vocab.append({
|
||||||
|
'id': str(uuid.uuid4()),
|
||||||
|
'english': entry.get('english', ''),
|
||||||
|
'german': entry.get('german', ''),
|
||||||
|
'example_sentence': entry.get('example', ''),
|
||||||
|
'source_page': page_number + 1,
|
||||||
|
})
|
||||||
|
logger.info(f" row merging: {n_before} → {len(merged_vocab)} entries")
|
||||||
|
page_vocabulary = merged_vocab
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f" row merging failed (non-critical): {e}")
|
||||||
|
|
||||||
logger.info(f" vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")
|
logger.info(f" vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")
|
||||||
|
|
||||||
total_duration = _time.time() - t_total
|
total_duration = _time.time() - t_total
|
||||||
|
|||||||
@@ -760,11 +760,28 @@ export function useVocabWorksheet(): VocabWorksheetHook {
|
|||||||
|
|
||||||
// Reprocess all successful pages with new IPA/syllable modes
|
// Reprocess all successful pages with new IPA/syllable modes
|
||||||
const reprocessPages = (ipa: IpaMode, syllable: SyllableMode) => {
|
const reprocessPages = (ipa: IpaMode, syllable: SyllableMode) => {
|
||||||
if (!session || successfulPages.length === 0) return
|
if (!session) return
|
||||||
|
|
||||||
|
// Determine pages to reprocess: use successfulPages if available,
|
||||||
|
// otherwise derive from vocabulary source_page or selectedPages
|
||||||
|
let pagesToReprocess: number[]
|
||||||
|
if (successfulPages.length > 0) {
|
||||||
|
pagesToReprocess = successfulPages.map(p => p - 1)
|
||||||
|
} else if (vocabulary.length > 0) {
|
||||||
|
// Derive from vocabulary entries' source_page (1-indexed → 0-indexed)
|
||||||
|
const pageSet = new Set(vocabulary.map(v => (v.source_page || 1) - 1))
|
||||||
|
pagesToReprocess = [...pageSet].sort((a, b) => a - b)
|
||||||
|
} else if (selectedPages.length > 0) {
|
||||||
|
pagesToReprocess = [...selectedPages]
|
||||||
|
} else {
|
||||||
|
// Fallback: try page 0
|
||||||
|
pagesToReprocess = [0]
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pagesToReprocess.length === 0) return
|
||||||
|
|
||||||
setIsExtracting(true)
|
setIsExtracting(true)
|
||||||
setExtractionStatus('Verarbeite mit neuen Einstellungen...')
|
setExtractionStatus('Verarbeite mit neuen Einstellungen...')
|
||||||
const pagesToReprocess = successfulPages.map(p => p - 1)
|
|
||||||
const API_BASE = getApiBase()
|
const API_BASE = getApiBase()
|
||||||
|
|
||||||
;(async () => {
|
;(async () => {
|
||||||
|
|||||||
Reference in New Issue
Block a user