diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 87ce37e..df9e723 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -2221,6 +2221,28 @@ async def build_box_grids(session_id: str, request: Request): if not detected_boxes: return {"session_id": session_id, "box_zones_rebuilt": 0, "spell_fixes": 0, "message": "No boxes detected"} + # Filter out false-positive boxes in header/footer margins. + # Textbook pages have ~2.5cm margins at top/bottom. At typical scan + # resolutions (150-300 DPI), that's roughly 5-10% of image height. + # A box whose vertical CENTER falls within the top or bottom 7% of + # the image is likely a page number, unit header, or running footer. + img_h_for_filter = grid_data.get("image_height", 0) or word_result.get("image_height", 0) + if img_h_for_filter > 0: + margin_frac = 0.07 # 7% of image height + margin_top = img_h_for_filter * margin_frac + margin_bottom = img_h_for_filter * (1 - margin_frac) + filtered = [] + for box in detected_boxes: + by = box.get("y", 0) + bh = box.get("h", 0) + box_center_y = by + bh / 2 + if box_center_y < margin_top or box_center_y > margin_bottom: + logger.info("build-box-grids: skipping header/footer box at y=%d h=%d (center=%.0f, margins=%.0f/%.0f)", + by, bh, box_center_y, margin_top, margin_bottom) + continue + filtered.append(box) + detected_boxes = filtered + body = {} try: body = await request.json()