fix: suppress text fragments in graphic detection
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 20s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 20s
- Raise min_area from 30 to 200 (text fragments are small)
- Raise word_pad from 3 to 10px (OCR bboxes are tight)
- Reduce morph close kernel from 5x5 to 3x3 (avoid reconnecting text)
- Tighten arrow detection: min 20px, circularity < 0.35, >= 2 defects
- Add 'noise' category for too-small elements, filter them out
- Raise min dimension from 4 to 8px
- Add debug logging for word count and exclusion coverage
- Raise max_area_ratio from 0.05 to 0.25 (allow larger illustrations)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -102,6 +102,8 @@ def _classify_shape(
|
|||||||
"""Classify contour shape → (shape_name, confidence).
|
"""Classify contour shape → (shape_name, confidence).
|
||||||
|
|
||||||
Uses circularity, aspect ratio, solidity, and vertex count.
|
Uses circularity, aspect ratio, solidity, and vertex count.
|
||||||
|
Only classifies as arrow/circle/line if the element is large enough
|
||||||
|
to be a genuine graphic (not a text fragment).
|
||||||
"""
|
"""
|
||||||
aspect = bw / bh if bh > 0 else 1.0
|
aspect = bw / bh if bh > 0 else 1.0
|
||||||
perimeter = cv2.arcLength(contour, True)
|
perimeter = cv2.arcLength(contour, True)
|
||||||
@@ -116,46 +118,47 @@ def _classify_shape(
|
|||||||
approx = cv2.approxPolyDP(contour, epsilon, True)
|
approx = cv2.approxPolyDP(contour, epsilon, True)
|
||||||
vertices = len(approx)
|
vertices = len(approx)
|
||||||
|
|
||||||
# --- Arrow detection ---
|
min_dim = min(bw, bh)
|
||||||
# Arrows typically have: vertices 5-8, moderate solidity (0.4-0.8),
|
max_dim = max(bw, bh)
|
||||||
# moderate aspect ratio, low circularity
|
|
||||||
if 4 <= vertices <= 9 and 0.3 < solidity < 0.85 and circularity < 0.5:
|
# --- Circle / balloon --- (check first, most reliable)
|
||||||
# Check for a pointed tip via convexity defects
|
# Must be reasonably large (not a dot/period)
|
||||||
|
if circularity > 0.70 and 0.6 < aspect < 1.7 and min_dim > 25:
|
||||||
|
conf = min(0.95, circularity)
|
||||||
|
return "circle", conf
|
||||||
|
|
||||||
|
# --- Arrow detection --- (strict: must be sizable, distinct shape)
|
||||||
|
# Arrows must be at least 20px in both dimensions
|
||||||
|
if (min_dim > 20 and max_dim > 30
|
||||||
|
and 5 <= vertices <= 9
|
||||||
|
and 0.35 < solidity < 0.80
|
||||||
|
and circularity < 0.35):
|
||||||
hull_idx = cv2.convexHull(contour, returnPoints=False)
|
hull_idx = cv2.convexHull(contour, returnPoints=False)
|
||||||
if len(hull_idx) >= 4:
|
if len(hull_idx) >= 4:
|
||||||
try:
|
try:
|
||||||
defects = cv2.convexityDefects(contour, hull_idx)
|
defects = cv2.convexityDefects(contour, hull_idx)
|
||||||
if defects is not None and len(defects) >= 1:
|
if defects is not None and len(defects) >= 2:
|
||||||
# Significant defect = pointed shape
|
|
||||||
max_depth = max(d[0][3] for d in defects) / 256.0
|
max_depth = max(d[0][3] for d in defects) / 256.0
|
||||||
if max_depth > min(bw, bh) * 0.15:
|
if max_depth > min_dim * 0.25:
|
||||||
return "arrow", min(0.75, 0.5 + max_depth / max(bw, bh))
|
return "arrow", min(0.75, 0.5 + max_depth / max_dim)
|
||||||
except cv2.error:
|
except cv2.error:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# --- Circle / balloon ---
|
# --- Line (decorative rule, separator) ---
|
||||||
if circularity > 0.65 and 0.5 < aspect < 2.0:
|
# Must be long enough to not be a dash/hyphen
|
||||||
conf = min(0.95, circularity)
|
if (aspect > 6.0 or aspect < 1 / 6.0) and max_dim > 40:
|
||||||
return "circle", conf
|
|
||||||
|
|
||||||
# --- Line ---
|
|
||||||
if aspect > 4.0 or aspect < 0.25:
|
|
||||||
return "line", 0.7
|
return "line", 0.7
|
||||||
|
|
||||||
# --- Exclamation mark (tall narrow + high solidity) ---
|
# --- Larger illustration (drawing, image) ---
|
||||||
if aspect < 0.45 and bh > 12 and solidity > 0.5:
|
if area > 3000 and min_dim > 30:
|
||||||
return "exclamation", 0.7
|
return "illustration", 0.6
|
||||||
|
|
||||||
# --- Dot / bullet (small, roughly square, high solidity) ---
|
# --- Generic icon (moderate size, non-text shape) ---
|
||||||
if max(bw, bh) < 20 and 0.5 < aspect < 2.0 and solidity > 0.6:
|
if area > 500 and min_dim > 15:
|
||||||
return "dot", 0.6
|
return "icon", 0.4
|
||||||
|
|
||||||
# --- Larger illustration ---
|
# Everything else is too small or text-like — skip
|
||||||
if area > 2000:
|
return "noise", 0.0
|
||||||
return "illustration", 0.5
|
|
||||||
|
|
||||||
# --- Generic icon ---
|
|
||||||
return "icon", 0.4
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -166,10 +169,10 @@ def detect_graphic_elements(
|
|||||||
img_bgr: np.ndarray,
|
img_bgr: np.ndarray,
|
||||||
word_boxes: List[Dict],
|
word_boxes: List[Dict],
|
||||||
detected_boxes: Optional[List[Dict]] = None,
|
detected_boxes: Optional[List[Dict]] = None,
|
||||||
min_area: int = 30,
|
min_area: int = 200,
|
||||||
max_area_ratio: float = 0.05,
|
max_area_ratio: float = 0.25,
|
||||||
word_pad: int = 3,
|
word_pad: int = 10,
|
||||||
max_elements: int = 80,
|
max_elements: int = 50,
|
||||||
) -> List[GraphicElement]:
|
) -> List[GraphicElement]:
|
||||||
"""Find non-text graphical elements on the page.
|
"""Find non-text graphical elements on the page.
|
||||||
|
|
||||||
@@ -181,9 +184,9 @@ def detect_graphic_elements(
|
|||||||
img_bgr: BGR color image.
|
img_bgr: BGR color image.
|
||||||
word_boxes: List of OCR word dicts with left/top/width/height.
|
word_boxes: List of OCR word dicts with left/top/width/height.
|
||||||
detected_boxes: Optional list of detected box dicts (x/y/w/h).
|
detected_boxes: Optional list of detected box dicts (x/y/w/h).
|
||||||
min_area: Minimum contour area to keep.
|
min_area: Minimum contour area to keep (200 filters text fragments).
|
||||||
max_area_ratio: Maximum area as fraction of image area.
|
max_area_ratio: Maximum area as fraction of image area.
|
||||||
word_pad: Padding around word boxes for exclusion.
|
word_pad: Padding around word boxes for exclusion (10px covers font edges).
|
||||||
max_elements: Maximum number of elements to return.
|
max_elements: Maximum number of elements to return.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@@ -195,16 +198,17 @@ def detect_graphic_elements(
|
|||||||
h, w = img_bgr.shape[:2]
|
h, w = img_bgr.shape[:2]
|
||||||
max_area = int(h * w * max_area_ratio)
|
max_area = int(h * w * max_area_ratio)
|
||||||
|
|
||||||
|
logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
|
||||||
|
w, h, len(word_boxes), len(detected_boxes or []))
|
||||||
|
|
||||||
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
|
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
|
||||||
hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
|
hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
|
||||||
|
|
||||||
# --- 1. Build ink mask: dark pixels + saturated colored pixels ---
|
# --- 1. Build ink mask: dark pixels + saturated colored pixels ---
|
||||||
# Adaptive threshold for dark ink
|
|
||||||
_, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
|
_, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
|
||||||
|
|
||||||
# Saturated colored pixels (catches colored arrows, markers)
|
# Saturated colored pixels (catches colored arrows, markers)
|
||||||
sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
|
sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
|
||||||
# Only include saturated pixels that are also reasonably dark (not background)
|
|
||||||
val_mask = (hsv[:, :, 2] < 230).astype(np.uint8) * 255
|
val_mask = (hsv[:, :, 2] < 230).astype(np.uint8) * 255
|
||||||
color_ink = cv2.bitwise_and(sat_mask, val_mask)
|
color_ink = cv2.bitwise_and(sat_mask, val_mask)
|
||||||
|
|
||||||
@@ -236,15 +240,19 @@ def detect_graphic_elements(
|
|||||||
if x2 > x1 and y2 > y1:
|
if x2 > x1 and y2 > y1:
|
||||||
exclusion[y1:y2, x1:x2] = 255
|
exclusion[y1:y2, x1:x2] = 255
|
||||||
|
|
||||||
|
excl_pct = int(np.sum(exclusion > 0) * 100 / (h * w)) if h * w else 0
|
||||||
|
logger.info("GraphicDetect: exclusion mask covers %d%% of image", excl_pct)
|
||||||
|
|
||||||
# Subtract exclusion from ink
|
# Subtract exclusion from ink
|
||||||
graphic_mask = cv2.bitwise_and(ink_mask, cv2.bitwise_not(exclusion))
|
graphic_mask = cv2.bitwise_and(ink_mask, cv2.bitwise_not(exclusion))
|
||||||
|
|
||||||
# --- 3. Morphological cleanup ---
|
# --- 3. Morphological cleanup ---
|
||||||
# Close small gaps (connects arrow stroke + head)
|
# Close small gaps (connects arrow stroke + head) — but not too large
|
||||||
kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
|
# to avoid reconnecting text fragments
|
||||||
|
kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
|
||||||
graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_CLOSE, kernel_close)
|
graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_CLOSE, kernel_close)
|
||||||
# Remove tiny noise
|
# Remove small noise
|
||||||
kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
|
kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
|
||||||
graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_OPEN, kernel_open)
|
graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_OPEN, kernel_open)
|
||||||
|
|
||||||
# --- 4. Find contours ---
|
# --- 4. Find contours ---
|
||||||
@@ -252,6 +260,8 @@ def detect_graphic_elements(
|
|||||||
graphic_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
|
graphic_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logger.info("GraphicDetect: %d raw contours after exclusion", len(contours))
|
||||||
|
|
||||||
# --- 5. Analyse and classify ---
|
# --- 5. Analyse and classify ---
|
||||||
candidates: List[GraphicElement] = []
|
candidates: List[GraphicElement] = []
|
||||||
for cnt in contours:
|
for cnt in contours:
|
||||||
@@ -260,22 +270,24 @@ def detect_graphic_elements(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
bx, by, bw, bh = cv2.boundingRect(cnt)
|
bx, by, bw, bh = cv2.boundingRect(cnt)
|
||||||
if bw < 4 or bh < 4:
|
if bw < 8 or bh < 8:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Skip elements that are mostly inside the exclusion zone
|
# Skip elements that overlap significantly with the exclusion zone
|
||||||
# (partial overlap with a word)
|
|
||||||
roi_excl = exclusion[by:by + bh, bx:bx + bw]
|
roi_excl = exclusion[by:by + bh, bx:bx + bw]
|
||||||
excl_ratio = np.sum(roi_excl > 0) / (bw * bh) if bw * bh > 0 else 0
|
excl_ratio = np.sum(roi_excl > 0) / (bw * bh) if bw * bh > 0 else 0
|
||||||
if excl_ratio > 0.6:
|
if excl_ratio > 0.4:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Classify shape
|
# Classify shape
|
||||||
shape, conf = _classify_shape(cnt, bw, bh, area)
|
shape, conf = _classify_shape(cnt, bw, bh, area)
|
||||||
|
|
||||||
|
# Skip noise (too small or text-like)
|
||||||
|
if shape == "noise":
|
||||||
|
continue
|
||||||
|
|
||||||
# Determine dominant color
|
# Determine dominant color
|
||||||
roi_hsv = hsv[by:by + bh, bx:bx + bw]
|
roi_hsv = hsv[by:by + bh, bx:bx + bw]
|
||||||
# Only sample pixels that are actually in the contour
|
|
||||||
cnt_mask = np.zeros((bh, bw), dtype=np.uint8)
|
cnt_mask = np.zeros((bh, bw), dtype=np.uint8)
|
||||||
shifted_cnt = cnt - np.array([bx, by])
|
shifted_cnt = cnt - np.array([bx, by])
|
||||||
cv2.drawContours(cnt_mask, [shifted_cnt], -1, 255, -1)
|
cv2.drawContours(cnt_mask, [shifted_cnt], -1, 255, -1)
|
||||||
@@ -305,5 +317,7 @@ def detect_graphic_elements(
|
|||||||
len(result),
|
len(result),
|
||||||
", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
|
", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
logger.info("GraphicDetect: no graphic elements found")
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -1236,6 +1236,10 @@ async def detect_structure(session_id: str):
|
|||||||
for cell in word_result["cells"]:
|
for cell in word_result["cells"]:
|
||||||
for wb in (cell.get("word_boxes") or []):
|
for wb in (cell.get("word_boxes") or []):
|
||||||
words.append(wb)
|
words.append(wb)
|
||||||
|
logger.info("detect-structure: word_result present=%s, cells=%d, word_boxes extracted=%d",
|
||||||
|
word_result is not None,
|
||||||
|
len(word_result.get("cells", [])) if word_result else 0,
|
||||||
|
len(words))
|
||||||
# If no words yet, use image dimensions with small margin
|
# If no words yet, use image dimensions with small margin
|
||||||
if words:
|
if words:
|
||||||
content_x = max(0, min(int(wb["left"]) for wb in words))
|
content_x = max(0, min(int(wb["left"]) for wb in words))
|
||||||
|
|||||||
Reference in New Issue
Block a user