feat: integrate graphic element detection into structure step
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m58s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s

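Add cv_graphic_detect.py for detecting non-text visual elements (arrows,
circles, lines, exclamation marks, dots, icons, illustrations). Run the
detection as part of the detect_structure step and store the results under
the "graphics" key of the structure result. Draw the detected graphics on
the structure overlay image as dashed, labelled bounding boxes, and display
them in the frontend StepStructureDetection component with shape counts and
individual listings.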

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 13:21:55 +01:00
parent 1d34785e2b
commit 6b9b280ba3
4 changed files with 447 additions and 2 deletions

View File

@@ -73,6 +73,7 @@ from cv_vocab_pipeline import (
)
from cv_box_detect import detect_boxes, split_page_into_zones
from cv_color_detect import detect_word_colors, recover_colored_text, _COLOR_RANGES, _COLOR_HEX
from cv_graphic_detect import detect_graphic_elements
from cv_words_first import build_grid_from_words
from ocr_pipeline_session_store import (
create_session_db,
@@ -1304,6 +1305,16 @@ async def detect_structure(session_id: str):
if pixel_count > 50: # minimum threshold
color_summary[color_name] = pixel_count
# --- Graphic element detection ---
box_dicts = [
{"x": b.x, "y": b.y, "w": b.width, "h": b.height}
for b in boxes
]
graphics = detect_graphic_elements(
img_bgr, words,
detected_boxes=box_dicts,
)
duration = time.time() - t0
result_dict = {
@@ -1332,6 +1343,17 @@ async def detect_structure(session_id: str):
}
for z in zones
],
"graphics": [
{
"x": g.x, "y": g.y, "w": g.width, "h": g.height,
"area": g.area,
"shape": g.shape,
"color_name": g.color_name,
"color_hex": g.color_hex,
"confidence": round(g.confidence, 2),
}
for g in graphics
],
"color_pixel_counts": color_summary,
"has_words": len(words) > 0,
"word_count": len(words),
@@ -1342,8 +1364,8 @@ async def detect_structure(session_id: str):
await update_session_db(session_id, structure_result=result_dict)
cached["structure_result"] = result_dict
logger.info("detect-structure session %s: %d boxes, %d zones, %.2fs",
session_id, len(boxes), len(zones), duration)
logger.info("detect-structure session %s: %d boxes, %d zones, %d graphics, %.2fs",
session_id, len(boxes), len(zones), len(graphics), duration)
return {"session_id": session_id, **result_dict}
@@ -1777,6 +1799,48 @@ async def _get_structure_overlay(session_id: str) -> Response:
continue
cv2.drawContours(img, [cnt], -1, draw_color, 2)
# --- Draw graphic elements ---
graphics_data = structure.get("graphics", [])
shape_icons = {
"arrow": "ARROW",
"circle": "CIRCLE",
"line": "LINE",
"exclamation": "!",
"dot": "DOT",
"icon": "ICON",
"illustration": "ILLUST",
}
for gfx in graphics_data:
gx, gy = gfx["x"], gfx["y"]
gw, gh = gfx["w"], gfx["h"]
shape = gfx.get("shape", "icon")
color_hex = gfx.get("color_hex", "#6b7280")
conf = gfx.get("confidence", 0)
# Pick draw color based on element color (BGR)
gfx_bgr = bg_hex_to_bgr.get(color_hex, (128, 114, 107))
# Draw bounding box (dashed style via short segments)
dash = 6
for seg_x in range(gx, gx + gw, dash * 2):
end_x = min(seg_x + dash, gx + gw)
cv2.line(img, (seg_x, gy), (end_x, gy), gfx_bgr, 2)
cv2.line(img, (seg_x, gy + gh), (end_x, gy + gh), gfx_bgr, 2)
for seg_y in range(gy, gy + gh, dash * 2):
end_y = min(seg_y + dash, gy + gh)
cv2.line(img, (gx, seg_y), (gx, end_y), gfx_bgr, 2)
cv2.line(img, (gx + gw, seg_y), (gx + gw, end_y), gfx_bgr, 2)
# Label
icon = shape_icons.get(shape, shape.upper()[:5])
label = f"{icon} {int(conf * 100)}%"
# White background for readability
(tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)
lx = gx + 2
ly = max(gy - 4, th + 4)
cv2.rectangle(img, (lx - 1, ly - th - 2), (lx + tw + 2, ly + 3), (255, 255, 255), -1)
cv2.putText(img, label, (lx, ly), cv2.FONT_HERSHEY_SIMPLEX, 0.4, gfx_bgr, 1)
# Encode result
_, png_buf = cv2.imencode(".png", img)
return Response(content=png_buf.tobytes(), media_type="image/png")