Filter phantom rows from recovered color artifacts and low-conf OCR noise

- Apply recovered-artifact filter to ALL zones (was box-zones only)
- Filter any recovered word with text ≤ 2 chars (not just !?•·)
- Add post-grid junk-row removal: rows where all word_boxes have
  conf < 50 and text ≤ 3 chars are dropped as OCR noise

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-18 09:00:43 +01:00
parent 143e41ec76
commit f9bad7beaa

View File

@@ -805,29 +805,27 @@ async def build_grid(session_id: str):
# First pass: build grids per zone independently
zone_grids: List[Dict] = []
_RECOVERED_NOISE = {"!", "?", "", "·"}
for pz in page_zones:
zone_words = _words_in_zone(
all_words, pz.y, pz.height, pz.x, pz.width
)
# In box zones, filter out recovered single-char artifacts
# (decorative elements like !, ?, • from color recovery)
if pz.zone_type == "box":
before = len(zone_words)
zone_words = [
w for w in zone_words
if not (
w.get("recovered")
and w.get("text", "").strip() in _RECOVERED_NOISE
)
]
removed = before - len(zone_words)
if removed:
logger.info(
"build-grid: filtered %d recovered artifacts from box zone %d",
removed, pz.index,
)
# Filter short recovered artifacts (stripped text of at most 2 characters)
# in ALL zones: decorative colored pixel blobs like !, ?, • from
# recover_colored_text that don't represent real text.
before = len(zone_words)
zone_words = [
w for w in zone_words
if not (
w.get("recovered")
and len(w.get("text", "").strip()) <= 2
)
]
removed = before - len(zone_words)
if removed:
logger.info(
"build-grid: filtered %d recovered artifacts from %s zone %d",
removed, pz.zone_type, pz.index,
)
grid = _build_zone_grid(
zone_words, pz.x, pz.y, pz.width, pz.height,
pz.index, img_w, img_h,
@@ -940,8 +938,20 @@ async def build_grid(session_id: str):
# 4. Fallback: no boxes detected → single zone with all words
if not zones_data:
# Filter short recovered artifacts (stripped text of at most 2 characters; same rule as in the zone loop above)
before = len(all_words)
filtered_words = [
w for w in all_words
if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
]
removed = before - len(filtered_words)
if removed:
logger.info(
"build-grid session %s: filtered %d recovered artifacts (fallback zone)",
session_id, removed,
)
grid = _build_zone_grid(
all_words, content_x, content_y, content_w, content_h,
filtered_words, content_x, content_y, content_w, content_h,
0, img_w, img_h,
)
grid.pop("_raw_columns", None)
@@ -963,6 +973,44 @@ async def build_grid(session_id: str):
**grid,
})
# 4b. Remove junk rows: rows where ALL cells contain only short,
# low-confidence text (OCR noise, stray marks). Real vocabulary rows
# have at least one word with conf >= 50 or meaningful text length.
_JUNK_CONF_THRESHOLD = 50
_JUNK_MAX_TEXT_LEN = 3
for z in zones_data:
cells = z.get("cells", [])
rows = z.get("rows", [])
if not cells or not rows:
continue
junk_row_indices = set()
for row in rows:
ri = row["index"]
row_cells = [c for c in cells if c.get("row_index") == ri]
if not row_cells:
continue
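# A row is junk only if EVERY word_box in EVERY cell of the row has
# conf below the threshold AND stripped text no longer than the max
# length (a missing conf counts as 0). NOTE: a row whose cells carry no
# word_boxes at all is also flagged as junk, since nothing clears the flag.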
all_junk = True
for cell in row_cells:
for wb in cell.get("word_boxes") or []:
text = (wb.get("text") or "").strip()
conf = wb.get("conf", 0)
if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
all_junk = False
break
if not all_junk:
break
if all_junk:
junk_row_indices.add(ri)
if junk_row_indices:
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
logger.info(
"build-grid: removed %d junk rows from zone %d: %s",
len(junk_row_indices), z["zone_index"],
sorted(junk_row_indices),
)
# 5. Color annotation on final word_boxes in cells
if img_bgr is not None:
all_wb: List[Dict] = []