From e019dde01b53be379e515f5bcf6525d466900e89 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 26 Mar 2026 08:52:09 +0100 Subject: [PATCH] Extract page number as metadata instead of silently removing it _filter_footer_words now returns page number info (text, y_pct, number) instead of just removing footer words. The page number is included in the grid result as `page_number` and displayed in the frontend summary bar as "S. 233". This preserves page numbers for later page concatenation in the customer frontend while still removing them from the grid content. Co-Authored-By: Claude Opus 4.6 --- .../components/grid-editor/GridEditor.tsx | 5 ++++ admin-lehrer/components/grid-editor/types.ts | 8 ++++++ klausur-service/backend/grid_editor_api.py | 6 +++-- .../backend/grid_editor_helpers.py | 25 ++++++++++++++----- 4 files changed, 36 insertions(+), 8 deletions(-) diff --git a/admin-lehrer/components/grid-editor/GridEditor.tsx b/admin-lehrer/components/grid-editor/GridEditor.tsx index 0f1c928..e608e1c 100644 --- a/admin-lehrer/components/grid-editor/GridEditor.tsx +++ b/admin-lehrer/components/grid-editor/GridEditor.tsx @@ -174,6 +174,11 @@ export function GridEditor({ sessionId, onNext }: GridEditorProps) { Woerterbuch ({Math.round(grid.dictionary_detection.confidence * 100)}%) )} + {grid.page_number?.text && ( + + S. {grid.page_number.text} + + )} {grid.duration_seconds.toFixed(1)}s diff --git a/admin-lehrer/components/grid-editor/types.ts b/admin-lehrer/components/grid-editor/types.ts index a8392a6..e39ec8a 100644 --- a/admin-lehrer/components/grid-editor/types.ts +++ b/admin-lehrer/components/grid-editor/types.ts @@ -20,6 +20,13 @@ export interface DictionaryDetection { headword_col_index: number | null } +/** Page number extracted from footer region of the scan. */ +export interface PageNumber { + text: string + y_pct: number + number?: number +} + /** A complete structured grid with zones, ready for the Excel-like editor. */ export interface StructuredGrid { session_id: string @@ -31,6 +38,7 @@ export interface StructuredGrid { formatting: GridFormatting layout_metrics?: LayoutMetrics dictionary_detection?: DictionaryDetection + page_number?: PageNumber | null duration_seconds: number edited?: boolean layout_dividers?: LayoutDividers diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 1aa12a9..8708804 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -124,8 +124,9 @@ async def _build_grid_core( # 2c. Filter footer rows (page numbers at the very bottom). # Isolated short text in the bottom 5% of the page is typically a - # page number ("64", "S. 12") and not real content. - _filter_footer_words(all_words, img_h, logger, session_id) + # page number ("64", "S. 12") and not real content. The page number + # is extracted as metadata for the frontend header display. + page_number_info = _filter_footer_words(all_words, img_h, logger, session_id) # 2c2. Filter OCR junk from header illustrations. # Low-confidence short fragments above the first real content row. @@ -1668,6 +1669,7 @@ async def _build_grid_core( "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False, "syllables_applied": syllable_insertions > 0, }, + "page_number": page_number_info, "duration_seconds": round(duration, 2), } diff --git a/klausur-service/backend/grid_editor_helpers.py b/klausur-service/backend/grid_editor_helpers.py index 6de3b3a..40c3b19 100644 --- a/klausur-service/backend/grid_editor_helpers.py +++ b/klausur-service/backend/grid_editor_helpers.py @@ -1301,29 +1301,42 @@ def _filter_footer_words( img_h: int, log: Any, session_id: str, -) -> None: +) -> Optional[Dict]: """Remove isolated words in the bottom 5% of the page (page numbers). - Modifies *words* in place. + Modifies *words* in place and returns a page_number metadata dict + if a page number was extracted, or None. """ if not words or img_h <= 0: - return + return None footer_y = img_h * 0.95 footer_words = [ w for w in words if w["top"] + w.get("height", 0) / 2 > footer_y ] if not footer_words: - return + return None # Only remove if footer has very few words (≤ 3) with short text total_text = "".join((w.get("text") or "").strip() for w in footer_words) if len(footer_words) <= 3 and len(total_text) <= 10: + # Extract page number metadata before removing + page_number_info = { + "text": total_text.strip(), + "y_pct": round(footer_words[0]["top"] / img_h * 100, 1), + } + # Try to parse as integer + digits = "".join(c for c in total_text if c.isdigit()) + if digits: + page_number_info["number"] = int(digits) + footer_set = set(id(w) for w in footer_words) words[:] = [w for w in words if id(w) not in footer_set] log.info( - "build-grid session %s: removed %d footer words ('%s')", - session_id, len(footer_words), total_text, + "build-grid session %s: extracted page number '%s' and removed %d footer words", + session_id, total_text, len(footer_words), ) + return page_number_info + return None def _filter_header_junk(