Filter phantom rows from recovered color artifacts and low-conf OCR noise
- Apply recovered-artifact filter to ALL zones (was box zones only)
- Filter any recovered word with text ≤ 2 chars (not just !, ?, •, ·)
- Add post-grid junk-row removal: rows where all word_boxes have conf < 50 and text ≤ 3 chars are dropped as OCR noise

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -805,29 +805,27 @@ async def build_grid(session_id: str):
|
||||
# First pass: build grids per zone independently
|
||||
zone_grids: List[Dict] = []
|
||||
|
||||
_RECOVERED_NOISE = {"!", "?", "•", "·"}
|
||||
|
||||
for pz in page_zones:
|
||||
zone_words = _words_in_zone(
|
||||
all_words, pz.y, pz.height, pz.x, pz.width
|
||||
)
|
||||
# In box zones, filter out recovered single-char artifacts
|
||||
# (decorative elements like !, ?, • from color recovery)
|
||||
if pz.zone_type == "box":
|
||||
before = len(zone_words)
|
||||
zone_words = [
|
||||
w for w in zone_words
|
||||
if not (
|
||||
w.get("recovered")
|
||||
and w.get("text", "").strip() in _RECOVERED_NOISE
|
||||
)
|
||||
]
|
||||
removed = before - len(zone_words)
|
||||
if removed:
|
||||
logger.info(
|
||||
"build-grid: filtered %d recovered artifacts from box zone %d",
|
||||
removed, pz.index,
|
||||
)
|
||||
# Filter short recovered artifacts (text ≤ 2 chars) in ALL zones
|
||||
# (decorative colored pixel blobs like !, ?, • from
|
||||
# recover_colored_text that don't represent real text)
|
||||
before = len(zone_words)
|
||||
zone_words = [
|
||||
w for w in zone_words
|
||||
if not (
|
||||
w.get("recovered")
|
||||
and len(w.get("text", "").strip()) <= 2
|
||||
)
|
||||
]
|
||||
removed = before - len(zone_words)
|
||||
if removed:
|
||||
logger.info(
|
||||
"build-grid: filtered %d recovered artifacts from %s zone %d",
|
||||
removed, pz.zone_type, pz.index,
|
||||
)
|
||||
grid = _build_zone_grid(
|
||||
zone_words, pz.x, pz.y, pz.width, pz.height,
|
||||
pz.index, img_w, img_h,
|
||||
@@ -940,8 +938,20 @@ async def build_grid(session_id: str):
|
||||
|
||||
# 4. Fallback: no boxes detected → single zone with all words
|
||||
if not zones_data:
|
||||
# Filter short recovered artifacts (text ≤ 2 chars; same as in zone loop above)
|
||||
before = len(all_words)
|
||||
filtered_words = [
|
||||
w for w in all_words
|
||||
if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
|
||||
]
|
||||
removed = before - len(filtered_words)
|
||||
if removed:
|
||||
logger.info(
|
||||
"build-grid session %s: filtered %d recovered artifacts (fallback zone)",
|
||||
session_id, removed,
|
||||
)
|
||||
grid = _build_zone_grid(
|
||||
all_words, content_x, content_y, content_w, content_h,
|
||||
filtered_words, content_x, content_y, content_w, content_h,
|
||||
0, img_w, img_h,
|
||||
)
|
||||
grid.pop("_raw_columns", None)
|
||||
@@ -963,6 +973,44 @@ async def build_grid(session_id: str):
|
||||
**grid,
|
||||
})
|
||||
|
||||
# 4b. Remove junk rows: rows where ALL cells contain only short,
|
||||
# low-confidence text (OCR noise, stray marks). Real vocabulary rows
|
||||
# have at least one word with conf >= 50 or meaningful text length.
|
||||
_JUNK_CONF_THRESHOLD = 50
|
||||
_JUNK_MAX_TEXT_LEN = 3
|
||||
for z in zones_data:
|
||||
cells = z.get("cells", [])
|
||||
rows = z.get("rows", [])
|
||||
if not cells or not rows:
|
||||
continue
|
||||
junk_row_indices = set()
|
||||
for row in rows:
|
||||
ri = row["index"]
|
||||
row_cells = [c for c in cells if c.get("row_index") == ri]
|
||||
if not row_cells:
|
||||
continue
|
||||
# Check if ALL word_boxes in ALL cells of this row are junk
|
||||
all_junk = True
|
||||
for cell in row_cells:
|
||||
for wb in cell.get("word_boxes") or []:
|
||||
text = (wb.get("text") or "").strip()
|
||||
conf = wb.get("conf", 0)
|
||||
if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
|
||||
all_junk = False
|
||||
break
|
||||
if not all_junk:
|
||||
break
|
||||
if all_junk:
|
||||
junk_row_indices.add(ri)
|
||||
if junk_row_indices:
|
||||
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
|
||||
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
|
||||
logger.info(
|
||||
"build-grid: removed %d junk rows from zone %d: %s",
|
||||
len(junk_row_indices), z["zone_index"],
|
||||
sorted(junk_row_indices),
|
||||
)
|
||||
|
||||
# 5. Color annotation on final word_boxes in cells
|
||||
if img_bgr is not None:
|
||||
all_wb: List[Dict] = []
|
||||
|
||||
Reference in New Issue
Block a user