feat: add border ghost filter + graphic detection tests + structure overlay

- Add _filter_border_ghost_words() to remove OCR artefacts from box borders
  (vertical + horizontal edge detection, column cleanup, re-indexing)
- Add 20 tests for border ghost filter (basic filtering + column cleanup)
- Add 24 tests for cv_graphic_detect (color detection, word overlap, boxes)
- Clean up cv_graphic_detect.py logging (per-candidate → DEBUG)
- Add structure overlay layer to StepReconstruction (boxes + graphics toggle)
- Show border_ghosts_removed badge in StepStructureDetection
- Update MkDocs with structure detection documentation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 18:28:53 +01:00
parent 6668661895
commit 729ebff63c
8 changed files with 1006 additions and 29 deletions
--- a/klausur-service/backend/cv_graphic_detect.py
+++ b/klausur-service/backend/cv_graphic_detect.py
@@ -121,10 +121,9 @@ def detect_graphic_elements(
        return []

    h, w = img_bgr.shape[:2]
-    img_area = h * w

-    logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
-                w, h, len(word_boxes), len(detected_boxes or []))
+    logger.debug("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
+                 w, h, len(word_boxes), len(detected_boxes or []))

    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    candidates: List[GraphicElement] = []
@@ -161,7 +160,7 @@ def detect_graphic_elements(
    contours_regions, _ = cv2.findContours(
        region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
-    logger.info("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))
+    logger.debug("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))

    for cnt in contours_regions:
        bx, by, bw, bh = cv2.boundingRect(cnt)
@@ -172,7 +171,7 @@ def detect_graphic_elements(

        # Skip page-spanning regions
        if bw > w * 0.5 or bh > h * 0.5:
-            logger.info("GraphicDetect PASS1 SKIP page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
+            logger.debug("GraphicDetect PASS1 skip page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
            continue

        bbox_area = bw * bh
@@ -188,8 +187,8 @@ def detect_graphic_elements(

        # If most of the region is covered by word boxes → colored text, skip
        if word_overlap > 0.5:
-            logger.info("GraphicDetect PASS1 SKIP text region (%d,%d) %dx%d word_overlap=%.0f%%",
-                        bx, by, bw, bh, word_overlap * 100)
+            logger.debug("GraphicDetect PASS1 skip text region (%d,%d) %dx%d overlap=%.0f%%",
+                         bx, by, bw, bh, word_overlap * 100)
            continue

        # Need a minimum number of colored pixels (not just dilated area)
@@ -209,8 +208,7 @@ def detect_graphic_elements(
        density = color_pixel_count / bbox_area if bbox_area > 0 else 0
        conf = min(0.95, 0.5 + density * 0.5)

-        logger.info("GraphicDetect PASS1 ACCEPT image at (%d,%d) %dx%d "
-                     "color_px=%d word_overlap=%.0f%% color=%s",
+        logger.debug("GraphicDetect PASS1 accept (%d,%d) %dx%d px=%d overlap=%.0f%% %s",
                     bx, by, bw, bh, color_pixel_count, word_overlap * 100, color_name)
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
@@ -256,7 +254,7 @@ def detect_graphic_elements(
    contours_ink, _ = cv2.findContours(
        ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
-    logger.info("GraphicDetect PASS2 (ink): %d contours", len(contours_ink))
+    logger.debug("GraphicDetect PASS2 ink: %d contours", len(contours_ink))

    for cnt in contours_ink:
        area = cv2.contourArea(cnt)
@@ -267,8 +265,8 @@ def detect_graphic_elements(
        if bw > w * 0.8 or bh > h * 0.8:
            continue

-        logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d",
-                    bx, by, bw, bh, int(area))
+        logger.debug("GraphicDetect PASS2 accept (%d,%d) %dx%d area=%d",
+                     bx, by, bw, bh, int(area))
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=int(area), shape="illustration",