feat: integrate graphic element detection into structure step
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m58s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m58s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
Add cv_graphic_detect.py for detecting non-text visual elements (arrows, circles, lines, exclamation marks, icons, illustrations). Draw the detected graphics on the structure overlay image and display them in the frontend StepStructureDetection component, with shape counts and individual listings.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -73,6 +73,7 @@ from cv_vocab_pipeline import (
|
||||
)
|
||||
from cv_box_detect import detect_boxes, split_page_into_zones
|
||||
from cv_color_detect import detect_word_colors, recover_colored_text, _COLOR_RANGES, _COLOR_HEX
|
||||
from cv_graphic_detect import detect_graphic_elements
|
||||
from cv_words_first import build_grid_from_words
|
||||
from ocr_pipeline_session_store import (
|
||||
create_session_db,
|
||||
@@ -1304,6 +1305,16 @@ async def detect_structure(session_id: str):
|
||||
if pixel_count > 50: # minimum threshold
|
||||
color_summary[color_name] = pixel_count
|
||||
|
||||
# --- Graphic element detection ---
|
||||
box_dicts = [
|
||||
{"x": b.x, "y": b.y, "w": b.width, "h": b.height}
|
||||
for b in boxes
|
||||
]
|
||||
graphics = detect_graphic_elements(
|
||||
img_bgr, words,
|
||||
detected_boxes=box_dicts,
|
||||
)
|
||||
|
||||
duration = time.time() - t0
|
||||
|
||||
result_dict = {
|
||||
@@ -1332,6 +1343,17 @@ async def detect_structure(session_id: str):
|
||||
}
|
||||
for z in zones
|
||||
],
|
||||
"graphics": [
|
||||
{
|
||||
"x": g.x, "y": g.y, "w": g.width, "h": g.height,
|
||||
"area": g.area,
|
||||
"shape": g.shape,
|
||||
"color_name": g.color_name,
|
||||
"color_hex": g.color_hex,
|
||||
"confidence": round(g.confidence, 2),
|
||||
}
|
||||
for g in graphics
|
||||
],
|
||||
"color_pixel_counts": color_summary,
|
||||
"has_words": len(words) > 0,
|
||||
"word_count": len(words),
|
||||
@@ -1342,8 +1364,8 @@ async def detect_structure(session_id: str):
|
||||
await update_session_db(session_id, structure_result=result_dict)
|
||||
cached["structure_result"] = result_dict
|
||||
|
||||
logger.info("detect-structure session %s: %d boxes, %d zones, %.2fs",
|
||||
session_id, len(boxes), len(zones), duration)
|
||||
logger.info("detect-structure session %s: %d boxes, %d zones, %d graphics, %.2fs",
|
||||
session_id, len(boxes), len(zones), len(graphics), duration)
|
||||
|
||||
return {"session_id": session_id, **result_dict}
|
||||
|
||||
@@ -1777,6 +1799,48 @@ async def _get_structure_overlay(session_id: str) -> Response:
|
||||
continue
|
||||
cv2.drawContours(img, [cnt], -1, draw_color, 2)
|
||||
|
||||
# --- Draw graphic elements ---
|
||||
graphics_data = structure.get("graphics", [])
|
||||
shape_icons = {
|
||||
"arrow": "ARROW",
|
||||
"circle": "CIRCLE",
|
||||
"line": "LINE",
|
||||
"exclamation": "!",
|
||||
"dot": "DOT",
|
||||
"icon": "ICON",
|
||||
"illustration": "ILLUST",
|
||||
}
|
||||
for gfx in graphics_data:
|
||||
gx, gy = gfx["x"], gfx["y"]
|
||||
gw, gh = gfx["w"], gfx["h"]
|
||||
shape = gfx.get("shape", "icon")
|
||||
color_hex = gfx.get("color_hex", "#6b7280")
|
||||
conf = gfx.get("confidence", 0)
|
||||
|
||||
# Pick draw color based on element color (BGR)
|
||||
gfx_bgr = bg_hex_to_bgr.get(color_hex, (128, 114, 107))
|
||||
|
||||
# Draw bounding box (dashed style via short segments)
|
||||
dash = 6
|
||||
for seg_x in range(gx, gx + gw, dash * 2):
|
||||
end_x = min(seg_x + dash, gx + gw)
|
||||
cv2.line(img, (seg_x, gy), (end_x, gy), gfx_bgr, 2)
|
||||
cv2.line(img, (seg_x, gy + gh), (end_x, gy + gh), gfx_bgr, 2)
|
||||
for seg_y in range(gy, gy + gh, dash * 2):
|
||||
end_y = min(seg_y + dash, gy + gh)
|
||||
cv2.line(img, (gx, seg_y), (gx, end_y), gfx_bgr, 2)
|
||||
cv2.line(img, (gx + gw, seg_y), (gx + gw, end_y), gfx_bgr, 2)
|
||||
|
||||
# Label
|
||||
icon = shape_icons.get(shape, shape.upper()[:5])
|
||||
label = f"{icon} {int(conf * 100)}%"
|
||||
# White background for readability
|
||||
(tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)
|
||||
lx = gx + 2
|
||||
ly = max(gy - 4, th + 4)
|
||||
cv2.rectangle(img, (lx - 1, ly - th - 2), (lx + tw + 2, ly + 3), (255, 255, 255), -1)
|
||||
cv2.putText(img, label, (lx, ly), cv2.FONT_HERSHEY_SIMPLEX, 0.4, gfx_bgr, 1)
|
||||
|
||||
# Encode result
|
||||
_, png_buf = cv2.imencode(".png", img)
|
||||
return Response(content=png_buf.tobytes(), media_type="image/png")
|
||||
|
||||
Reference in New Issue
Block a user