Add oversized-stub filter for large page numbers/marks in grid rows
Rows with ≤2 words, total text ≤3 chars, and word height >1.8× median are removed as non-content elements (e.g. a red page number such as "( 9").

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -976,6 +976,9 @@ async def build_grid(session_id: str):
|
|||||||
# 4b. Remove junk rows: rows where ALL cells contain only short,
|
# 4b. Remove junk rows: rows where ALL cells contain only short,
|
||||||
# low-confidence text (OCR noise, stray marks). Real vocabulary rows
|
# low-confidence text (OCR noise, stray marks). Real vocabulary rows
|
||||||
# have at least one word with conf >= 50 or meaningful text length.
|
# have at least one word with conf >= 50 or meaningful text length.
|
||||||
|
# Also remove "oversized stub" rows: rows with ≤2 very short words
|
||||||
|
# whose word-boxes are significantly taller than the median (e.g.
|
||||||
|
# large red page numbers like "( 9" that are not real text content).
|
||||||
_JUNK_CONF_THRESHOLD = 50
|
_JUNK_CONF_THRESHOLD = 50
|
||||||
_JUNK_MAX_TEXT_LEN = 3
|
_JUNK_MAX_TEXT_LEN = 3
|
||||||
for z in zones_data:
|
for z in zones_data:
|
||||||
@@ -983,25 +986,49 @@ async def build_grid(session_id: str):
|
|||||||
rows = z.get("rows", [])
|
rows = z.get("rows", [])
|
||||||
if not cells or not rows:
|
if not cells or not rows:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Compute median word height across the zone for oversized detection
|
||||||
|
all_wb_heights = [
|
||||||
|
wb["height"]
|
||||||
|
for cell in cells
|
||||||
|
for wb in cell.get("word_boxes") or []
|
||||||
|
if wb.get("height", 0) > 0
|
||||||
|
]
|
||||||
|
median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
|
||||||
|
|
||||||
junk_row_indices = set()
|
junk_row_indices = set()
|
||||||
for row in rows:
|
for row in rows:
|
||||||
ri = row["index"]
|
ri = row["index"]
|
||||||
row_cells = [c for c in cells if c.get("row_index") == ri]
|
row_cells = [c for c in cells if c.get("row_index") == ri]
|
||||||
if not row_cells:
|
if not row_cells:
|
||||||
continue
|
continue
|
||||||
# Check if ALL word_boxes in ALL cells of this row are junk
|
|
||||||
|
row_wbs = [
|
||||||
|
wb for cell in row_cells
|
||||||
|
for wb in cell.get("word_boxes") or []
|
||||||
|
]
|
||||||
|
|
||||||
|
# Rule 1: ALL word_boxes are low-conf AND short text
|
||||||
all_junk = True
|
all_junk = True
|
||||||
for cell in row_cells:
|
for wb in row_wbs:
|
||||||
for wb in cell.get("word_boxes") or []:
|
text = (wb.get("text") or "").strip()
|
||||||
text = (wb.get("text") or "").strip()
|
conf = wb.get("conf", 0)
|
||||||
conf = wb.get("conf", 0)
|
if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
|
||||||
if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
|
all_junk = False
|
||||||
all_junk = False
|
|
||||||
break
|
|
||||||
if not all_junk:
|
|
||||||
break
|
break
|
||||||
if all_junk:
|
if all_junk and row_wbs:
|
||||||
junk_row_indices.add(ri)
|
junk_row_indices.add(ri)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Rule 2: oversized stub — ≤2 words, combined text ≤3 chars,
|
||||||
|
# and word height > 1.8× median (page numbers, stray marks)
|
||||||
|
if len(row_wbs) <= 2:
|
||||||
|
total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
|
||||||
|
max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
|
||||||
|
if len(total_text) <= 3 and max_h > median_wb_h * 1.8:
|
||||||
|
junk_row_indices.add(ri)
|
||||||
|
continue
|
||||||
|
|
||||||
if junk_row_indices:
|
if junk_row_indices:
|
||||||
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
|
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
|
||||||
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
|
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
|
||||||
|
|||||||
Reference in New Issue
Block a user