fix: prevent colored text from being falsely detected as graphics

Add color pixel density checks to Pass 1 of cv_graphic_detect.py:

- density < 20% → skip (text strokes are thin, images are filled)
- density < 30% and height < 4% of page height → skip (colored text line)

This fixes green headings (Insel, Internet, Inuit) being removed as graphic regions, which also caused word reordering in lines.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 17:30:35 +01:00
parent d66efdecf5
commit 6e1d715d0d
1 changed files with 26 additions and 3 deletions
@@ -195,6 +195,30 @@ def detect_graphic_elements(
        if color_pixel_count < 200:
            continue
        # Color pixel density: fraction of bbox filled with colored pixels.
        # Text strokes are thin → low density (typically 5-20%).
        # Actual images/graphics are filled → high density (30%+).
        density = color_pixel_count / bbox_area if bbox_area > 0 else 0
        # Very low density → almost certainly colored text, not an image
        if density < 0.20:
            logger.debug(
                "GraphicDetect PASS1 skip low-density (%d,%d) %dx%d "
                "density=%.0f%% (likely colored text)",
                bx, by, bw, bh, density * 100,
            )
            continue
        # Moderate density + small height → likely a colored text line
        # (text-line height is typically < 3% of page height)
        if density < 0.30 and bh < h * 0.04:
            logger.debug(
                "GraphicDetect PASS1 skip text-height region (%d,%d) %dx%d "
                "density=%.0f%% height=%.1f%% (likely colored text line)",
                bx, by, bw, bh, density * 100, 100.0 * bh / h,
            )
            continue
        # Determine dominant color from the actual colored pixels
        roi_hsv = hsv[by:by + bh, bx:bx + bw]
        color_px_mask = roi_color > 0
@@ -205,11 +229,10 @@ def detect_graphic_elements(
            color_name, color_hex = "black", _COLOR_HEX["black"]
        # Confidence based on color density and low word overlap
        density = color_pixel_count / bbox_area if bbox_area > 0 else 0
        conf = min(0.95, 0.5 + density * 0.5)
-        logger.debug("GraphicDetect PASS1 accept (%d,%d) %dx%d px=%d overlap=%.0f%% %s",
+        logger.debug("GraphicDetect PASS1 accept (%d,%d) %dx%d px=%d density=%.0f%% overlap=%.0f%% %s",
-                     bx, by, bw, bh, color_pixel_count, word_overlap * 100, color_name)
+                     bx, by, bw, bh, color_pixel_count, density * 100, word_overlap * 100, color_name)
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=color_pixel_count,