Filter phantom rows from recovered color artifacts and low-conf OCR noise
- Apply the recovered-artifact filter to ALL zones (previously box zones only).
- Filter any recovered word whose text is ≤ 2 characters (not just !, ?, •, ·).
- Add post-grid junk-row removal: rows in which every word_box has conf < 50 and text ≤ 3 characters are dropped as OCR noise.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -805,29 +805,27 @@ async def build_grid(session_id: str):
|
|||||||
# First pass: build grids per zone independently
|
# First pass: build grids per zone independently
|
||||||
zone_grids: List[Dict] = []
|
zone_grids: List[Dict] = []
|
||||||
|
|
||||||
_RECOVERED_NOISE = {"!", "?", "•", "·"}
|
|
||||||
|
|
||||||
for pz in page_zones:
|
for pz in page_zones:
|
||||||
zone_words = _words_in_zone(
|
zone_words = _words_in_zone(
|
||||||
all_words, pz.y, pz.height, pz.x, pz.width
|
all_words, pz.y, pz.height, pz.x, pz.width
|
||||||
)
|
)
|
||||||
# In box zones, filter out recovered single-char artifacts
|
# Filter short recovered artifacts (text ≤ 2 chars) in ALL zones
|
||||||
# (decorative elements like !, ?, • from color recovery)
|
# (decorative colored pixel blobs like !, ?, • from
|
||||||
if pz.zone_type == "box":
|
# recover_colored_text that don't represent real text)
|
||||||
before = len(zone_words)
|
before = len(zone_words)
|
||||||
zone_words = [
|
zone_words = [
|
||||||
w for w in zone_words
|
w for w in zone_words
|
||||||
if not (
|
if not (
|
||||||
w.get("recovered")
|
w.get("recovered")
|
||||||
and w.get("text", "").strip() in _RECOVERED_NOISE
|
and len(w.get("text", "").strip()) <= 2
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
removed = before - len(zone_words)
|
removed = before - len(zone_words)
|
||||||
if removed:
|
if removed:
|
||||||
logger.info(
|
logger.info(
|
||||||
"build-grid: filtered %d recovered artifacts from box zone %d",
|
"build-grid: filtered %d recovered artifacts from %s zone %d",
|
||||||
removed, pz.index,
|
removed, pz.zone_type, pz.index,
|
||||||
)
|
)
|
||||||
grid = _build_zone_grid(
|
grid = _build_zone_grid(
|
||||||
zone_words, pz.x, pz.y, pz.width, pz.height,
|
zone_words, pz.x, pz.y, pz.width, pz.height,
|
||||||
pz.index, img_w, img_h,
|
pz.index, img_w, img_h,
|
||||||
@@ -940,8 +938,20 @@ async def build_grid(session_id: str):
|
|||||||
|
|
||||||
# 4. Fallback: no boxes detected → single zone with all words
|
# 4. Fallback: no boxes detected → single zone with all words
|
||||||
if not zones_data:
|
if not zones_data:
|
||||||
|
# Filter short recovered artifacts (text ≤ 2 chars), same as in the zone loop above
|
||||||
|
before = len(all_words)
|
||||||
|
filtered_words = [
|
||||||
|
w for w in all_words
|
||||||
|
if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
|
||||||
|
]
|
||||||
|
removed = before - len(filtered_words)
|
||||||
|
if removed:
|
||||||
|
logger.info(
|
||||||
|
"build-grid session %s: filtered %d recovered artifacts (fallback zone)",
|
||||||
|
session_id, removed,
|
||||||
|
)
|
||||||
grid = _build_zone_grid(
|
grid = _build_zone_grid(
|
||||||
all_words, content_x, content_y, content_w, content_h,
|
filtered_words, content_x, content_y, content_w, content_h,
|
||||||
0, img_w, img_h,
|
0, img_w, img_h,
|
||||||
)
|
)
|
||||||
grid.pop("_raw_columns", None)
|
grid.pop("_raw_columns", None)
|
||||||
@@ -963,6 +973,44 @@ async def build_grid(session_id: str):
|
|||||||
**grid,
|
**grid,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# 4b. Remove junk rows: rows where ALL cells contain only short,
|
||||||
|
# low-confidence text (OCR noise, stray marks). Real vocabulary rows
|
||||||
|
# have at least one word with conf >= 50 or meaningful text length.
|
||||||
|
_JUNK_CONF_THRESHOLD = 50
|
||||||
|
_JUNK_MAX_TEXT_LEN = 3
|
||||||
|
for z in zones_data:
|
||||||
|
cells = z.get("cells", [])
|
||||||
|
rows = z.get("rows", [])
|
||||||
|
if not cells or not rows:
|
||||||
|
continue
|
||||||
|
junk_row_indices = set()
|
||||||
|
for row in rows:
|
||||||
|
ri = row["index"]
|
||||||
|
row_cells = [c for c in cells if c.get("row_index") == ri]
|
||||||
|
if not row_cells:
|
||||||
|
continue
|
||||||
|
# Check if ALL word_boxes in ALL cells of this row are junk
|
||||||
|
all_junk = True
|
||||||
|
for cell in row_cells:
|
||||||
|
for wb in cell.get("word_boxes") or []:
|
||||||
|
text = (wb.get("text") or "").strip()
|
||||||
|
conf = wb.get("conf", 0)
|
||||||
|
if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
|
||||||
|
all_junk = False
|
||||||
|
break
|
||||||
|
if not all_junk:
|
||||||
|
break
|
||||||
|
if all_junk:
|
||||||
|
junk_row_indices.add(ri)
|
||||||
|
if junk_row_indices:
|
||||||
|
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
|
||||||
|
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
|
||||||
|
logger.info(
|
||||||
|
"build-grid: removed %d junk rows from zone %d: %s",
|
||||||
|
len(junk_row_indices), z["zone_index"],
|
||||||
|
sorted(junk_row_indices),
|
||||||
|
)
|
||||||
|
|
||||||
# 5. Color annotation on final word_boxes in cells
|
# 5. Color annotation on final word_boxes in cells
|
||||||
if img_bgr is not None:
|
if img_bgr is not None:
|
||||||
all_wb: List[Dict] = []
|
all_wb: List[Dict] = []
|
||||||
|
|||||||
Reference in New Issue
Block a user