fix: robust colored-text detection in graphic filter

The 25x25 dilation kernel merges nearby green words into large regions,
so pixel-overlap with OCR word boxes drops below 50%. Previous density
checks alone weren't sufficient.

New multi-layered approach:
- Count OCR word CENTROIDS inside each colored region
- ≥2 centroids → treated as text (images rarely produce 2+ OCR word detections)
- ≥1 centroid + >10% pixel overlap → likely text
- Lower pixel overlap threshold from 50% to 40%
- Raise density+height thresholds for text-line detection (density <30% → <35%, height <4% → <5%)
- Log skip decisions at INFO (previously DEBUG) to diagnose remaining false positives

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-17 18:09:16 +01:00
parent 6e1d715d0d
commit a079ffe8e9

View File

@@ -181,40 +181,71 @@ def detect_graphic_elements(
word_pixel_count = int(np.sum(roi_words > 0))
word_overlap = word_pixel_count / bbox_area if bbox_area > 0 else 0
# Check: how many OCR word centroids fall inside this region?
# Colored text that OCR detected will have multiple centroids inside.
# Actual images may have 0-1 spurious OCR artifacts.
word_centroid_count = sum(
1 for wb in word_boxes
if (bx <= int(wb.get("left", 0) + wb.get("width", 0) / 2) <= bx + bw
and by <= int(wb.get("top", 0) + wb.get("height", 0) / 2) <= by + bh)
)
# Check: how many actual colored pixels are in this region?
roi_color = color_pixel_raw[by:by + bh, bx:bx + bw]
color_pixel_count = int(np.sum(roi_color > 0))
# If most of the region is covered by word boxes → colored text, skip
if word_overlap > 0.5:
logger.debug("GraphicDetect PASS1 skip text region (%d,%d) %dx%d overlap=%.0f%%",
bx, by, bw, bh, word_overlap * 100)
# Color pixel density (before any skip checks so we can log it)
density = color_pixel_count / bbox_area if bbox_area > 0 else 0
# --- Skip heuristics for colored TEXT (not images) ---
# (a) High word-box pixel overlap → clearly text
if word_overlap > 0.40:
logger.info(
"GraphicDetect PASS1 skip text-overlap (%d,%d) %dx%d "
"overlap=%.0f%% centroids=%d",
bx, by, bw, bh, word_overlap * 100, word_centroid_count,
)
continue
# (b) Multiple OCR words detected inside → colored text
# (images rarely produce 2+ confident word detections)
if word_centroid_count >= 2:
logger.info(
"GraphicDetect PASS1 skip multi-word (%d,%d) %dx%d "
"centroids=%d overlap=%.0f%% density=%.0f%%",
bx, by, bw, bh, word_centroid_count,
word_overlap * 100, density * 100,
)
continue
# (c) Even 1 word + some pixel overlap → likely text
if word_centroid_count >= 1 and word_overlap > 0.10:
logger.info(
"GraphicDetect PASS1 skip word+overlap (%d,%d) %dx%d "
"centroids=%d overlap=%.0f%%",
bx, by, bw, bh, word_centroid_count, word_overlap * 100,
)
continue
# Need a minimum number of colored pixels (not just dilated area)
if color_pixel_count < 200:
continue
# Color pixel density: fraction of bbox filled with colored pixels.
# Text strokes are thin → low density (typically 5-20%).
# Actual images/graphics are filled → high density (30%+).
density = color_pixel_count / bbox_area if bbox_area > 0 else 0
# Very low density → almost certainly colored text, not an image
# (d) Very low density → thin strokes, almost certainly text
if density < 0.20:
logger.debug(
logger.info(
"GraphicDetect PASS1 skip low-density (%d,%d) %dx%d "
"density=%.0f%% (likely colored text)",
bx, by, bw, bh, density * 100,
)
continue
# Moderate density + small height → likely a colored text line
# (text-line height is typically < 3% of page height)
if density < 0.30 and bh < h * 0.04:
logger.debug(
"GraphicDetect PASS1 skip text-height region (%d,%d) %dx%d "
"density=%.0f%% height=%.1f%% (likely colored text line)",
# (e) Moderate density + small height → colored text line
if density < 0.35 and bh < h * 0.05:
logger.info(
"GraphicDetect PASS1 skip text-height (%d,%d) %dx%d "
"density=%.0f%% height=%.1f%%",
bx, by, bw, bh, density * 100, 100.0 * bh / h,
)
continue