diff --git a/admin-lehrer/components/grid-editor/GridEditor.tsx b/admin-lehrer/components/grid-editor/GridEditor.tsx index 0f1c928..e608e1c 100644 --- a/admin-lehrer/components/grid-editor/GridEditor.tsx +++ b/admin-lehrer/components/grid-editor/GridEditor.tsx @@ -174,6 +174,11 @@ export function GridEditor({ sessionId, onNext }: GridEditorProps) { Woerterbuch ({Math.round(grid.dictionary_detection.confidence * 100)}%) )} + {grid.page_number?.text && ( + + S. {grid.page_number.text} + + )} {grid.duration_seconds.toFixed(1)}s diff --git a/admin-lehrer/components/grid-editor/types.ts b/admin-lehrer/components/grid-editor/types.ts index a8392a6..e39ec8a 100644 --- a/admin-lehrer/components/grid-editor/types.ts +++ b/admin-lehrer/components/grid-editor/types.ts @@ -20,6 +20,13 @@ export interface DictionaryDetection { headword_col_index: number | null } +/** Page number extracted from footer region of the scan. */ +export interface PageNumber { + text: string + y_pct: number + number?: number +} + /** A complete structured grid with zones, ready for the Excel-like editor. */ export interface StructuredGrid { session_id: string @@ -31,6 +38,7 @@ export interface StructuredGrid { formatting: GridFormatting layout_metrics?: LayoutMetrics dictionary_detection?: DictionaryDetection + page_number?: PageNumber | null duration_seconds: number edited?: boolean layout_dividers?: LayoutDividers diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 1aa12a9..8708804 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -124,8 +124,9 @@ async def _build_grid_core( # 2c. Filter footer rows (page numbers at the very bottom). # Isolated short text in the bottom 5% of the page is typically a - # page number ("64", "S. 12") and not real content. - _filter_footer_words(all_words, img_h, logger, session_id) + # page number ("64", "S. 12") and not real content. The page number + # is extracted as metadata for the frontend header display. + page_number_info = _filter_footer_words(all_words, img_h, logger, session_id) # 2c2. Filter OCR junk from header illustrations. # Low-confidence short fragments above the first real content row. @@ -1668,6 +1669,7 @@ async def _build_grid_core( "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False, "syllables_applied": syllable_insertions > 0, }, + "page_number": page_number_info, "duration_seconds": round(duration, 2), } diff --git a/klausur-service/backend/grid_editor_helpers.py b/klausur-service/backend/grid_editor_helpers.py index 6de3b3a..40c3b19 100644 --- a/klausur-service/backend/grid_editor_helpers.py +++ b/klausur-service/backend/grid_editor_helpers.py @@ -1301,29 +1301,42 @@ def _filter_footer_words( img_h: int, log: Any, session_id: str, -) -> None: +) -> Optional[Dict]: """Remove isolated words in the bottom 5% of the page (page numbers). - Modifies *words* in place. + Modifies *words* in place and returns a page_number metadata dict + if a page number was extracted, or None. """ if not words or img_h <= 0: - return + return None footer_y = img_h * 0.95 footer_words = [ w for w in words if w["top"] + w.get("height", 0) / 2 > footer_y ] if not footer_words: - return + return None # Only remove if footer has very few words (≤ 3) with short text total_text = "".join((w.get("text") or "").strip() for w in footer_words) if len(footer_words) <= 3 and len(total_text) <= 10: + # Extract page number metadata before removing + page_number_info = { + "text": total_text.strip(), + "y_pct": round(footer_words[0]["top"] / img_h * 100, 1), + } + # Try to parse as integer + digits = "".join(c for c in total_text if c.isdigit()) + if digits: + page_number_info["number"] = int(digits) + footer_set = set(id(w) for w in footer_words) words[:] = [w for w in words if id(w) not in footer_set] log.info( - "build-grid session %s: removed %d footer words ('%s')", - session_id, len(footer_words), total_text, + "build-grid session %s: extracted page number '%s' and removed %d footer words", + session_id, total_text, len(footer_words), ) + return page_number_info + return None def _filter_header_junk(