From baac98f837f5efe7ef9de3c14a90367630cddd83 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 13 Apr 2026 14:38:53 +0200 Subject: [PATCH] Filter false-positive boxes in header/footer margins MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Boxes whose vertical center falls within top/bottom 7% of image height are filtered out (page numbers, unit headers, running footers). At typical scan resolutions, 7% ≈ 2.5cm margin. Fixes: "Box 1" containing just "3" from "Unit 3" page header being incorrectly treated as an embedded box. Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/grid_editor_api.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 87ce37e..df9e723 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -2221,6 +2221,28 @@ async def build_box_grids(session_id: str, request: Request): if not detected_boxes: return {"session_id": session_id, "box_zones_rebuilt": 0, "spell_fixes": 0, "message": "No boxes detected"} + # Filter out false-positive boxes in header/footer margins. + # Textbook pages have ~2.5cm margins at top/bottom. At typical scan + # resolutions (150-300 DPI), that's roughly 5-10% of image height. + # A box whose vertical CENTER falls within the top or bottom 7% of + # the image is likely a page number, unit header, or running footer. + img_h_for_filter = grid_data.get("image_height", 0) or word_result.get("image_height", 0) + if img_h_for_filter > 0: + margin_frac = 0.07 # 7% of image height + margin_top = img_h_for_filter * margin_frac + margin_bottom = img_h_for_filter * (1 - margin_frac) + filtered = [] + for box in detected_boxes: + by = box.get("y", 0) + bh = box.get("h", 0) + box_center_y = by + bh / 2 + if box_center_y < margin_top or box_center_y > margin_bottom: + logger.info("build-box-grids: skipping header/footer box at y=%d h=%d (center=%.0f, margins=%.0f/%.0f)", + by, bh, box_center_y, margin_top, margin_bottom) + continue + filtered.append(box) + detected_boxes = filtered + body = {} try: body = await request.json()