feat: add border ghost filter + graphic detection tests + structure overlay
- Add _filter_border_ghost_words() to remove OCR artefacts from box borders (vertical + horizontal edge detection, column cleanup, re-indexing) - Add 20 tests for border ghost filter (basic filtering + column cleanup) - Add 24 tests for cv_graphic_detect (color detection, word overlap, boxes) - Clean up cv_graphic_detect.py logging (per-candidate → DEBUG) - Add structure overlay layer to StepReconstruction (boxes + graphics toggle) - Show border_ghosts_removed badge in StepStructureDetection - Update MkDocs with structure detection documentation Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1202,6 +1202,147 @@ async def detect_type(session_id: str):
|
||||
return {"session_id": session_id, **result_dict}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Border-ghost word filter
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Characters that OCR produces when reading box-border lines.
|
||||
_BORDER_GHOST_CHARS = set("|1lI![](){}iíì/\\-—–_~.,;:'\"")
|
||||
|
||||
|
||||
def _filter_border_ghost_words(
|
||||
word_result: Dict,
|
||||
boxes: List,
|
||||
) -> int:
|
||||
"""Remove OCR words that are actually box border lines.
|
||||
|
||||
A word is considered a border ghost when it sits on a known box edge
|
||||
(left, right, top, or bottom) and looks like a line artefact (narrow
|
||||
aspect ratio or text consists only of line-like characters).
|
||||
|
||||
After removing ghost cells, columns that have become empty are also
|
||||
removed from ``columns_used`` so the grid no longer shows phantom
|
||||
columns.
|
||||
|
||||
Modifies *word_result* in-place and returns the number of removed cells.
|
||||
"""
|
||||
if not boxes or not word_result:
|
||||
return 0
|
||||
|
||||
cells = word_result.get("cells")
|
||||
if not cells:
|
||||
return 0
|
||||
|
||||
# Build border bands — vertical (X) and horizontal (Y)
|
||||
x_bands = [] # list of (x_lo, x_hi)
|
||||
y_bands = [] # list of (y_lo, y_hi)
|
||||
for b in boxes:
|
||||
bx = b.x if hasattr(b, "x") else b.get("x", 0)
|
||||
by = b.y if hasattr(b, "y") else b.get("y", 0)
|
||||
bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0))
|
||||
bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0))
|
||||
bt = b.border_thickness if hasattr(b, "border_thickness") else b.get("border_thickness", 3)
|
||||
margin = max(bt * 2, 10) + 6 # generous margin
|
||||
|
||||
# Vertical edges (left / right)
|
||||
x_bands.append((bx - margin, bx + margin))
|
||||
x_bands.append((bx + bw - margin, bx + bw + margin))
|
||||
# Horizontal edges (top / bottom)
|
||||
y_bands.append((by - margin, by + margin))
|
||||
y_bands.append((by + bh - margin, by + bh + margin))
|
||||
|
||||
img_w = word_result.get("image_width", 1)
|
||||
img_h = word_result.get("image_height", 1)
|
||||
|
||||
def _is_ghost(cell: Dict) -> bool:
|
||||
text = (cell.get("text") or "").strip()
|
||||
if not text:
|
||||
return False
|
||||
|
||||
# Compute absolute pixel position
|
||||
if cell.get("bbox_px"):
|
||||
px = cell["bbox_px"]
|
||||
cx = px["x"] + px["w"] / 2
|
||||
cy = px["y"] + px["h"] / 2
|
||||
cw = px["w"]
|
||||
ch = px["h"]
|
||||
elif cell.get("bbox_pct"):
|
||||
pct = cell["bbox_pct"]
|
||||
cx = (pct["x"] / 100) * img_w + (pct["w"] / 100) * img_w / 2
|
||||
cy = (pct["y"] / 100) * img_h + (pct["h"] / 100) * img_h / 2
|
||||
cw = (pct["w"] / 100) * img_w
|
||||
ch = (pct["h"] / 100) * img_h
|
||||
else:
|
||||
return False
|
||||
|
||||
# Check if center sits on a vertical or horizontal border
|
||||
on_vertical = any(lo <= cx <= hi for lo, hi in x_bands)
|
||||
on_horizontal = any(lo <= cy <= hi for lo, hi in y_bands)
|
||||
if not on_vertical and not on_horizontal:
|
||||
return False
|
||||
|
||||
# Very short text (1-2 chars) on a border → very likely ghost
|
||||
if len(text) <= 2:
|
||||
# Narrow vertically (line-like) or narrow horizontally (dash-like)?
|
||||
if ch > 0 and cw / ch < 0.5:
|
||||
return True
|
||||
if cw > 0 and ch / cw < 0.5:
|
||||
return True
|
||||
# Text is only border-ghost characters?
|
||||
if all(c in _BORDER_GHOST_CHARS for c in text):
|
||||
return True
|
||||
|
||||
# Longer text but still only ghost chars and very narrow
|
||||
if all(c in _BORDER_GHOST_CHARS for c in text):
|
||||
if ch > 0 and cw / ch < 0.35:
|
||||
return True
|
||||
if cw > 0 and ch / cw < 0.35:
|
||||
return True
|
||||
return True # all ghost chars on a border → remove
|
||||
|
||||
return False
|
||||
|
||||
before = len(cells)
|
||||
word_result["cells"] = [c for c in cells if not _is_ghost(c)]
|
||||
removed = before - len(word_result["cells"])
|
||||
|
||||
# --- Remove empty columns from columns_used ---
|
||||
columns_used = word_result.get("columns_used")
|
||||
if removed and columns_used and len(columns_used) > 1:
|
||||
remaining_cells = word_result["cells"]
|
||||
occupied_cols = {c.get("col_index") for c in remaining_cells}
|
||||
before_cols = len(columns_used)
|
||||
columns_used = [col for col in columns_used if col.get("index") in occupied_cols]
|
||||
|
||||
# Re-index columns and remap cell col_index values
|
||||
if len(columns_used) < before_cols:
|
||||
old_to_new = {}
|
||||
for new_i, col in enumerate(columns_used):
|
||||
old_to_new[col["index"]] = new_i
|
||||
col["index"] = new_i
|
||||
for cell in remaining_cells:
|
||||
old_ci = cell.get("col_index")
|
||||
if old_ci in old_to_new:
|
||||
cell["col_index"] = old_to_new[old_ci]
|
||||
word_result["columns_used"] = columns_used
|
||||
logger.info("border-ghost: removed %d empty column(s), %d remaining",
|
||||
before_cols - len(columns_used), len(columns_used))
|
||||
|
||||
if removed:
|
||||
# Update summary counts
|
||||
summary = word_result.get("summary", {})
|
||||
summary["total_cells"] = len(word_result["cells"])
|
||||
summary["non_empty_cells"] = sum(1 for c in word_result["cells"] if c.get("text"))
|
||||
word_result["summary"] = summary
|
||||
gs = word_result.get("grid_shape", {})
|
||||
gs["total_cells"] = len(word_result["cells"])
|
||||
if columns_used is not None:
|
||||
gs["cols"] = len(columns_used)
|
||||
word_result["grid_shape"] = gs
|
||||
|
||||
return removed
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Structure Detection Endpoint
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -1236,10 +1377,6 @@ async def detect_structure(session_id: str):
|
||||
for cell in word_result["cells"]:
|
||||
for wb in (cell.get("word_boxes") or []):
|
||||
words.append(wb)
|
||||
logger.info("detect-structure: word_result present=%s, cells=%d, word_boxes extracted=%d",
|
||||
word_result is not None,
|
||||
len(word_result.get("cells", [])) if word_result else 0,
|
||||
len(words))
|
||||
# If no words yet, use image dimensions with small margin
|
||||
if words:
|
||||
content_x = max(0, min(int(wb["left"]) for wb in words))
|
||||
@@ -1319,6 +1456,15 @@ async def detect_structure(session_id: str):
|
||||
detected_boxes=box_dicts,
|
||||
)
|
||||
|
||||
# --- Filter border-ghost words from OCR result ---
|
||||
ghost_count = 0
|
||||
if boxes and word_result:
|
||||
ghost_count = _filter_border_ghost_words(word_result, boxes)
|
||||
if ghost_count:
|
||||
logger.info("detect-structure: removed %d border-ghost words", ghost_count)
|
||||
await update_session_db(session_id, word_result=word_result)
|
||||
cached["word_result"] = word_result
|
||||
|
||||
duration = time.time() - t0
|
||||
|
||||
result_dict = {
|
||||
@@ -1361,6 +1507,7 @@ async def detect_structure(session_id: str):
|
||||
"color_pixel_counts": color_summary,
|
||||
"has_words": len(words) > 0,
|
||||
"word_count": len(words),
|
||||
"border_ghosts_removed": ghost_count,
|
||||
"duration_seconds": round(duration, 2),
|
||||
}
|
||||
|
||||
@@ -1806,12 +1953,7 @@ async def _get_structure_overlay(session_id: str) -> Response:
|
||||
# --- Draw graphic elements ---
|
||||
graphics_data = structure.get("graphics", [])
|
||||
shape_icons = {
|
||||
"arrow": "ARROW",
|
||||
"circle": "CIRCLE",
|
||||
"line": "LINE",
|
||||
"exclamation": "!",
|
||||
"dot": "DOT",
|
||||
"icon": "ICON",
|
||||
"image": "IMAGE",
|
||||
"illustration": "ILLUST",
|
||||
}
|
||||
for gfx in graphics_data:
|
||||
|
||||
Reference in New Issue
Block a user