fix: robust colored-text detection in graphic filter
The 25x25 dilation kernel merges nearby green words into large regions, so pixel overlap with OCR word boxes drops below 50%. The previous density checks alone weren't sufficient.

New multi-layered approach:
- Count OCR word CENTROIDS inside each colored region
- ≥2 centroids → definitely text (images don't produce multiple words)
- 1 centroid + 10%+ pixel overlap → likely text
- Lower pixel overlap threshold from 50% to 40%
- Raise density+height thresholds for text-line detection
- Use INFO logging to diagnose remaining false positives

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -181,40 +181,71 @@ def detect_graphic_elements(
|
||||
word_pixel_count = int(np.sum(roi_words > 0))
|
||||
word_overlap = word_pixel_count / bbox_area if bbox_area > 0 else 0
|
||||
|
||||
# Check: how many OCR word centroids fall inside this region?
|
||||
# Colored text that OCR detected will have multiple centroids inside.
|
||||
# Actual images may have 0-1 spurious OCR artifacts.
|
||||
word_centroid_count = sum(
|
||||
1 for wb in word_boxes
|
||||
if (bx <= int(wb.get("left", 0) + wb.get("width", 0) / 2) <= bx + bw
|
||||
and by <= int(wb.get("top", 0) + wb.get("height", 0) / 2) <= by + bh)
|
||||
)
|
||||
|
||||
# Check: how many actual colored pixels are in this region?
|
||||
roi_color = color_pixel_raw[by:by + bh, bx:bx + bw]
|
||||
color_pixel_count = int(np.sum(roi_color > 0))
|
||||
|
||||
# If most of the region is covered by word boxes → colored text, skip
|
||||
if word_overlap > 0.5:
|
||||
logger.debug("GraphicDetect PASS1 skip text region (%d,%d) %dx%d overlap=%.0f%%",
|
||||
bx, by, bw, bh, word_overlap * 100)
|
||||
# Color pixel density (before any skip checks so we can log it)
|
||||
density = color_pixel_count / bbox_area if bbox_area > 0 else 0
|
||||
|
||||
# --- Skip heuristics for colored TEXT (not images) ---
|
||||
|
||||
# (a) High word-box pixel overlap → clearly text
|
||||
if word_overlap > 0.40:
|
||||
logger.info(
|
||||
"GraphicDetect PASS1 skip text-overlap (%d,%d) %dx%d "
|
||||
"overlap=%.0f%% centroids=%d",
|
||||
bx, by, bw, bh, word_overlap * 100, word_centroid_count,
|
||||
)
|
||||
continue
|
||||
|
||||
# (b) Multiple OCR words detected inside → colored text
|
||||
# (images rarely produce 2+ confident word detections)
|
||||
if word_centroid_count >= 2:
|
||||
logger.info(
|
||||
"GraphicDetect PASS1 skip multi-word (%d,%d) %dx%d "
|
||||
"centroids=%d overlap=%.0f%% density=%.0f%%",
|
||||
bx, by, bw, bh, word_centroid_count,
|
||||
word_overlap * 100, density * 100,
|
||||
)
|
||||
continue
|
||||
|
||||
# (c) Even 1 word + some pixel overlap → likely text
|
||||
if word_centroid_count >= 1 and word_overlap > 0.10:
|
||||
logger.info(
|
||||
"GraphicDetect PASS1 skip word+overlap (%d,%d) %dx%d "
|
||||
"centroids=%d overlap=%.0f%%",
|
||||
bx, by, bw, bh, word_centroid_count, word_overlap * 100,
|
||||
)
|
||||
continue
|
||||
|
||||
# Need a minimum number of colored pixels (not just dilated area)
|
||||
if color_pixel_count < 200:
|
||||
continue
|
||||
|
||||
# Color pixel density: fraction of bbox filled with colored pixels.
|
||||
# Text strokes are thin → low density (typically 5-20%).
|
||||
# Actual images/graphics are filled → high density (30%+).
|
||||
density = color_pixel_count / bbox_area if bbox_area > 0 else 0
|
||||
|
||||
# Very low density → almost certainly colored text, not an image
|
||||
# (d) Very low density → thin strokes, almost certainly text
|
||||
if density < 0.20:
|
||||
logger.debug(
|
||||
logger.info(
|
||||
"GraphicDetect PASS1 skip low-density (%d,%d) %dx%d "
|
||||
"density=%.0f%% (likely colored text)",
|
||||
bx, by, bw, bh, density * 100,
|
||||
)
|
||||
continue
|
||||
|
||||
# Moderate density + small height → likely a colored text line
|
||||
# (text-line height is typically < 3% of page height)
|
||||
if density < 0.30 and bh < h * 0.04:
|
||||
logger.debug(
|
||||
"GraphicDetect PASS1 skip text-height region (%d,%d) %dx%d "
|
||||
"density=%.0f%% height=%.1f%% (likely colored text line)",
|
||||
# (e) Moderate density + small height → colored text line
|
||||
if density < 0.35 and bh < h * 0.05:
|
||||
logger.info(
|
||||
"GraphicDetect PASS1 skip text-height (%d,%d) %dx%d "
|
||||
"density=%.0f%% height=%.1f%%",
|
||||
bx, by, bw, bh, density * 100, 100.0 * bh / h,
|
||||
)
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user