diff --git a/admin-lehrer/components/grid-editor/GridEditor.tsx b/admin-lehrer/components/grid-editor/GridEditor.tsx
index 0f1c928..e608e1c 100644
--- a/admin-lehrer/components/grid-editor/GridEditor.tsx
+++ b/admin-lehrer/components/grid-editor/GridEditor.tsx
@@ -174,6 +174,11 @@ export function GridEditor({ sessionId, onNext }: GridEditorProps) {
Woerterbuch ({Math.round(grid.dictionary_detection.confidence * 100)}%)
)}
+ {grid.page_number?.text && (
+
+ S. {grid.page_number.text}
+
+ )}
{grid.duration_seconds.toFixed(1)}s
diff --git a/admin-lehrer/components/grid-editor/types.ts b/admin-lehrer/components/grid-editor/types.ts
index a8392a6..e39ec8a 100644
--- a/admin-lehrer/components/grid-editor/types.ts
+++ b/admin-lehrer/components/grid-editor/types.ts
@@ -20,6 +20,13 @@ export interface DictionaryDetection {
headword_col_index: number | null
}
+/** Page number metadata extracted from the footer region of the scan (footer words are removed from the grid). */
+export interface PageNumber {
+ text: string // footer word texts, each trimmed and concatenated without separators
+ y_pct: number // top edge of the first footer word as a percentage of the image height (one decimal)
+ number?: number // integer formed from the digit characters found in the footer text; omitted when there are none
+}
+
/** A complete structured grid with zones, ready for the Excel-like editor. */
export interface StructuredGrid {
session_id: string
@@ -31,6 +38,7 @@ export interface StructuredGrid {
formatting: GridFormatting
layout_metrics?: LayoutMetrics
dictionary_detection?: DictionaryDetection
+ page_number?: PageNumber | null
duration_seconds: number
edited?: boolean
layout_dividers?: LayoutDividers
diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 1aa12a9..8708804 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -124,8 +124,9 @@ async def _build_grid_core(
# 2c. Filter footer rows (page numbers at the very bottom).
# Isolated short text in the bottom 5% of the page is typically a
- # page number ("64", "S. 12") and not real content.
- _filter_footer_words(all_words, img_h, logger, session_id)
+ # page number ("64", "S. 12") and not real content. The page number
+ # is extracted as metadata for the frontend header display.
+ page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)
# 2c2. Filter OCR junk from header illustrations.
# Low-confidence short fragments above the first real content row.
@@ -1668,6 +1669,7 @@ async def _build_grid_core(
"ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
"syllables_applied": syllable_insertions > 0,
},
+ "page_number": page_number_info,
"duration_seconds": round(duration, 2),
}
diff --git a/klausur-service/backend/grid_editor_helpers.py b/klausur-service/backend/grid_editor_helpers.py
index 6de3b3a..40c3b19 100644
--- a/klausur-service/backend/grid_editor_helpers.py
+++ b/klausur-service/backend/grid_editor_helpers.py
@@ -1301,29 +1301,42 @@ def _filter_footer_words(
img_h: int,
log: Any,
session_id: str,
-) -> None:
+) -> Optional[Dict]:
"""Remove isolated words in the bottom 5% of the page (page numbers).
- Modifies *words* in place.
+ Modifies *words* in place and returns a page_number metadata dict
+ if a page number was extracted, or None.
"""
if not words or img_h <= 0:
- return
+ return None
footer_y = img_h * 0.95
footer_words = [
w for w in words
if w["top"] + w.get("height", 0) / 2 > footer_y
]
if not footer_words:
- return
+ return None
# Only remove if footer has very few words (≤ 3) with short text
total_text = "".join((w.get("text") or "").strip() for w in footer_words)
if len(footer_words) <= 3 and len(total_text) <= 10:
+ # Extract page number metadata before removing
+ page_number_info = {
+ "text": total_text.strip(),
+ "y_pct": round(footer_words[0]["top"] / img_h * 100, 1),
+ }
+ # Try to parse as integer
+ digits = "".join(c for c in total_text if c.isdigit())
+ if digits:
+ page_number_info["number"] = int(digits)
+
footer_set = set(id(w) for w in footer_words)
words[:] = [w for w in words if id(w) not in footer_set]
log.info(
- "build-grid session %s: removed %d footer words ('%s')",
- session_id, len(footer_words), total_text,
+ "build-grid session %s: extracted page number '%s' and removed %d footer words",
+ session_id, total_text, len(footer_words),
)
+ return page_number_info
+ return None
def _filter_header_junk(