Filter false-positive boxes in header/footer margins
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 55s
CI / test-go-edu-search (push) Successful in 1m0s
CI / test-python-klausur (push) Failing after 2m35s
CI / test-python-agent-core (push) Successful in 27s
CI / test-nodejs-website (push) Successful in 27s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 55s
CI / test-go-edu-search (push) Successful in 1m0s
CI / test-python-klausur (push) Failing after 2m35s
CI / test-python-agent-core (push) Successful in 27s
CI / test-nodejs-website (push) Successful in 27s
Boxes whose vertical center falls within top/bottom 7% of image height are filtered out (page numbers, unit headers, running footers). At typical scan resolutions, 7% ≈ 2.5cm margin.

Fixes: "Box 1" containing just "3" from "Unit 3" page header being incorrectly treated as an embedded box.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2221,6 +2221,28 @@ async def build_box_grids(session_id: str, request: Request):
|
|||||||
if not detected_boxes:
|
if not detected_boxes:
|
||||||
return {"session_id": session_id, "box_zones_rebuilt": 0, "spell_fixes": 0, "message": "No boxes detected"}
|
return {"session_id": session_id, "box_zones_rebuilt": 0, "spell_fixes": 0, "message": "No boxes detected"}
|
||||||
|
|
||||||
|
# Filter out false-positive boxes in header/footer margins.
|
||||||
|
# Textbook pages have ~2.5cm margins at top/bottom. At typical scan
|
||||||
|
# resolutions (150-300 DPI), that's roughly 5-10% of image height.
|
||||||
|
# A box whose vertical CENTER falls within the top or bottom 7% of
|
||||||
|
# the image is likely a page number, unit header, or running footer.
|
||||||
|
img_h_for_filter = grid_data.get("image_height", 0) or word_result.get("image_height", 0)
|
||||||
|
if img_h_for_filter > 0:
|
||||||
|
margin_frac = 0.07 # 7% of image height
|
||||||
|
margin_top = img_h_for_filter * margin_frac
|
||||||
|
margin_bottom = img_h_for_filter * (1 - margin_frac)
|
||||||
|
filtered = []
|
||||||
|
for box in detected_boxes:
|
||||||
|
by = box.get("y", 0)
|
||||||
|
bh = box.get("h", 0)
|
||||||
|
box_center_y = by + bh / 2
|
||||||
|
if box_center_y < margin_top or box_center_y > margin_bottom:
|
||||||
|
logger.info("build-box-grids: skipping header/footer box at y=%d h=%d (center=%.0f, margins=%.0f/%.0f)",
|
||||||
|
by, bh, box_center_y, margin_top, margin_bottom)
|
||||||
|
continue
|
||||||
|
filtered.append(box)
|
||||||
|
detected_boxes = filtered
|
||||||
|
|
||||||
body = {}
|
body = {}
|
||||||
try:
|
try:
|
||||||
body = await request.json()
|
body = await request.json()
|
||||||
|
|||||||
Reference in New Issue
Block a user