feat: add border ghost filter + graphic detection tests + structure overlay

- Add _filter_border_ghost_words() to remove OCR artefacts from box borders
  (vertical + horizontal edge detection, column cleanup, re-indexing)
- Add 20 tests for border ghost filter (basic filtering + column cleanup)
- Add 24 tests for cv_graphic_detect (color detection, word overlap, boxes)
- Clean up cv_graphic_detect.py logging (per-candidate → DEBUG)
- Add structure overlay layer to StepReconstruction (boxes + graphics toggle)
- Show border_ghosts_removed badge in StepStructureDetection
- Update MkDocs with structure detection documentation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 18:28:53 +01:00
parent 6668661895
commit 729ebff63c
8 changed files with 1006 additions and 29 deletions

View File

@@ -121,10 +121,9 @@ def detect_graphic_elements(
return []
h, w = img_bgr.shape[:2]
img_area = h * w
logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
w, h, len(word_boxes), len(detected_boxes or []))
logger.debug("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
w, h, len(word_boxes), len(detected_boxes or []))
hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
candidates: List[GraphicElement] = []
@@ -161,7 +160,7 @@ def detect_graphic_elements(
contours_regions, _ = cv2.findContours(
region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
)
logger.info("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))
logger.debug("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))
for cnt in contours_regions:
bx, by, bw, bh = cv2.boundingRect(cnt)
@@ -172,7 +171,7 @@ def detect_graphic_elements(
# Skip page-spanning regions
if bw > w * 0.5 or bh > h * 0.5:
logger.info("GraphicDetect PASS1 SKIP page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
logger.debug("GraphicDetect PASS1 skip page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
continue
bbox_area = bw * bh
@@ -188,8 +187,8 @@ def detect_graphic_elements(
# If most of the region is covered by word boxes → colored text, skip
if word_overlap > 0.5:
logger.info("GraphicDetect PASS1 SKIP text region (%d,%d) %dx%d word_overlap=%.0f%%",
bx, by, bw, bh, word_overlap * 100)
logger.debug("GraphicDetect PASS1 skip text region (%d,%d) %dx%d overlap=%.0f%%",
bx, by, bw, bh, word_overlap * 100)
continue
# Need a minimum number of colored pixels (not just dilated area)
@@ -209,8 +208,7 @@ def detect_graphic_elements(
density = color_pixel_count / bbox_area if bbox_area > 0 else 0
conf = min(0.95, 0.5 + density * 0.5)
logger.info("GraphicDetect PASS1 ACCEPT image at (%d,%d) %dx%d "
"color_px=%d word_overlap=%.0f%% color=%s",
logger.debug("GraphicDetect PASS1 accept (%d,%d) %dx%d px=%d overlap=%.0f%% %s",
bx, by, bw, bh, color_pixel_count, word_overlap * 100, color_name)
candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh,
@@ -256,7 +254,7 @@ def detect_graphic_elements(
contours_ink, _ = cv2.findContours(
ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
)
logger.info("GraphicDetect PASS2 (ink): %d contours", len(contours_ink))
logger.debug("GraphicDetect PASS2 ink: %d contours", len(contours_ink))
for cnt in contours_ink:
area = cv2.contourArea(cnt)
@@ -267,8 +265,8 @@ def detect_graphic_elements(
if bw > w * 0.8 or bh > h * 0.8:
continue
logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d",
bx, by, bw, bh, int(area))
logger.debug("GraphicDetect PASS2 accept (%d,%d) %dx%d area=%d",
bx, by, bw, bh, int(area))
candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh,
area=int(area), shape="illustration",