feat: add border ghost filter + graphic detection tests + structure overlay
- Add _filter_border_ghost_words() to remove OCR artefacts from box borders (vertical + horizontal edge detection, column cleanup, re-indexing)
- Add 20 tests for border ghost filter (basic filtering + column cleanup)
- Add 24 tests for cv_graphic_detect (color detection, word overlap, boxes)
- Clean up cv_graphic_detect.py logging (per-candidate → DEBUG)
- Add structure overlay layer to StepReconstruction (boxes + graphics toggle)
- Show border_ghosts_removed badge in StepStructureDetection
- Update MkDocs with structure detection documentation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -121,10 +121,9 @@ def detect_graphic_elements(
|
||||
return []
|
||||
|
||||
h, w = img_bgr.shape[:2]
|
||||
img_area = h * w
|
||||
|
||||
logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
|
||||
w, h, len(word_boxes), len(detected_boxes or []))
|
||||
logger.debug("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
|
||||
w, h, len(word_boxes), len(detected_boxes or []))
|
||||
|
||||
hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
|
||||
candidates: List[GraphicElement] = []
|
||||
@@ -161,7 +160,7 @@ def detect_graphic_elements(
|
||||
contours_regions, _ = cv2.findContours(
|
||||
region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
|
||||
)
|
||||
logger.info("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))
|
||||
logger.debug("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))
|
||||
|
||||
for cnt in contours_regions:
|
||||
bx, by, bw, bh = cv2.boundingRect(cnt)
|
||||
@@ -172,7 +171,7 @@ def detect_graphic_elements(
|
||||
|
||||
# Skip page-spanning regions
|
||||
if bw > w * 0.5 or bh > h * 0.5:
|
||||
logger.info("GraphicDetect PASS1 SKIP page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
|
||||
logger.debug("GraphicDetect PASS1 skip page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
|
||||
continue
|
||||
|
||||
bbox_area = bw * bh
|
||||
@@ -188,8 +187,8 @@ def detect_graphic_elements(
|
||||
|
||||
# If most of the region is covered by word boxes → colored text, skip
|
||||
if word_overlap > 0.5:
|
||||
logger.info("GraphicDetect PASS1 SKIP text region (%d,%d) %dx%d word_overlap=%.0f%%",
|
||||
bx, by, bw, bh, word_overlap * 100)
|
||||
logger.debug("GraphicDetect PASS1 skip text region (%d,%d) %dx%d overlap=%.0f%%",
|
||||
bx, by, bw, bh, word_overlap * 100)
|
||||
continue
|
||||
|
||||
# Need a minimum number of colored pixels (not just dilated area)
|
||||
@@ -209,8 +208,7 @@ def detect_graphic_elements(
|
||||
density = color_pixel_count / bbox_area if bbox_area > 0 else 0
|
||||
conf = min(0.95, 0.5 + density * 0.5)
|
||||
|
||||
logger.info("GraphicDetect PASS1 ACCEPT image at (%d,%d) %dx%d "
|
||||
"color_px=%d word_overlap=%.0f%% color=%s",
|
||||
logger.debug("GraphicDetect PASS1 accept (%d,%d) %dx%d px=%d overlap=%.0f%% %s",
|
||||
bx, by, bw, bh, color_pixel_count, word_overlap * 100, color_name)
|
||||
candidates.append(GraphicElement(
|
||||
x=bx, y=by, width=bw, height=bh,
|
||||
@@ -256,7 +254,7 @@ def detect_graphic_elements(
|
||||
contours_ink, _ = cv2.findContours(
|
||||
ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
|
||||
)
|
||||
logger.info("GraphicDetect PASS2 (ink): %d contours", len(contours_ink))
|
||||
logger.debug("GraphicDetect PASS2 ink: %d contours", len(contours_ink))
|
||||
|
||||
for cnt in contours_ink:
|
||||
area = cv2.contourArea(cnt)
|
||||
@@ -267,8 +265,8 @@ def detect_graphic_elements(
|
||||
if bw > w * 0.8 or bh > h * 0.8:
|
||||
continue
|
||||
|
||||
logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d",
|
||||
bx, by, bw, bh, int(area))
|
||||
logger.debug("GraphicDetect PASS2 accept (%d,%d) %dx%d area=%d",
|
||||
bx, by, bw, bh, int(area))
|
||||
candidates.append(GraphicElement(
|
||||
x=bx, y=by, width=bw, height=bh,
|
||||
area=int(area), shape="illustration",
|
||||
|
||||
Reference in New Issue
Block a user