feat: integrate graphic element detection into structure step

Add cv_graphic_detect.py for detecting non-text visual elements (arrows, circles, lines, exclamation marks, icons, illustrations). Draw the detected graphics on the structure overlay image and display them in the frontend StepStructureDetection component with shape counts and individual listings.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 13:21:55 +01:00
parent 1d34785e2b
commit 6b9b280ba3
4 changed files with 447 additions and 2 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -73,6 +73,7 @@ from cv_vocab_pipeline import (
 )
 from cv_box_detect import detect_boxes, split_page_into_zones
 from cv_color_detect import detect_word_colors, recover_colored_text, _COLOR_RANGES, _COLOR_HEX
+from cv_graphic_detect import detect_graphic_elements
 from cv_words_first import build_grid_from_words
 from ocr_pipeline_session_store import (
    create_session_db,
@@ -1304,6 +1305,16 @@ async def detect_structure(session_id: str):
        if pixel_count > 50:  # minimum threshold
            color_summary[color_name] = pixel_count

+    # --- Graphic element detection ---
+    box_dicts = [
+        {"x": b.x, "y": b.y, "w": b.width, "h": b.height}
+        for b in boxes
+    ]
+    graphics = detect_graphic_elements(
+        img_bgr, words,
+        detected_boxes=box_dicts,
+    )
+
    duration = time.time() - t0

    result_dict = {
@@ -1332,6 +1343,17 @@ async def detect_structure(session_id: str):
            }
            for z in zones
        ],
+        "graphics": [
+            {
+                "x": g.x, "y": g.y, "w": g.width, "h": g.height,
+                "area": g.area,
+                "shape": g.shape,
+                "color_name": g.color_name,
+                "color_hex": g.color_hex,
+                "confidence": round(g.confidence, 2),
+            }
+            for g in graphics
+        ],
        "color_pixel_counts": color_summary,
        "has_words": len(words) > 0,
        "word_count": len(words),
@@ -1342,8 +1364,8 @@ async def detect_structure(session_id: str):
    await update_session_db(session_id, structure_result=result_dict)
    cached["structure_result"] = result_dict

-    logger.info("detect-structure session %s: %d boxes, %d zones, %.2fs",
-                session_id, len(boxes), len(zones), duration)
+    logger.info("detect-structure session %s: %d boxes, %d zones, %d graphics, %.2fs",
+                session_id, len(boxes), len(zones), len(graphics), duration)

    return {"session_id": session_id, **result_dict}

@@ -1777,6 +1799,48 @@ async def _get_structure_overlay(session_id: str) -> Response:
                continue
            cv2.drawContours(img, [cnt], -1, draw_color, 2)

+    # --- Draw graphic elements ---
+    graphics_data = structure.get("graphics", [])
+    shape_icons = {
+        "arrow": "ARROW",
+        "circle": "CIRCLE",
+        "line": "LINE",
+        "exclamation": "!",
+        "dot": "DOT",
+        "icon": "ICON",
+        "illustration": "ILLUST",
+    }
+    for gfx in graphics_data:
+        gx, gy = gfx["x"], gfx["y"]
+        gw, gh = gfx["w"], gfx["h"]
+        shape = gfx.get("shape", "icon")
+        color_hex = gfx.get("color_hex", "#6b7280")
+        conf = gfx.get("confidence", 0)
+
+        # Pick draw color based on element color (BGR)
+        gfx_bgr = bg_hex_to_bgr.get(color_hex, (128, 114, 107))
+
+        # Draw bounding box (dashed style via short segments)
+        dash = 6
+        for seg_x in range(gx, gx + gw, dash * 2):
+            end_x = min(seg_x + dash, gx + gw)
+            cv2.line(img, (seg_x, gy), (end_x, gy), gfx_bgr, 2)
+            cv2.line(img, (seg_x, gy + gh), (end_x, gy + gh), gfx_bgr, 2)
+        for seg_y in range(gy, gy + gh, dash * 2):
+            end_y = min(seg_y + dash, gy + gh)
+            cv2.line(img, (gx, seg_y), (gx, end_y), gfx_bgr, 2)
+            cv2.line(img, (gx + gw, seg_y), (gx + gw, end_y), gfx_bgr, 2)
+
+        # Label
+        icon = shape_icons.get(shape, shape.upper()[:5])
+        label = f"{icon} {int(conf * 100)}%"
+        # White background for readability
+        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)
+        lx = gx + 2
+        ly = max(gy - 4, th + 4)
+        cv2.rectangle(img, (lx - 1, ly - th - 2), (lx + tw + 2, ly + 3), (255, 255, 255), -1)
+        cv2.putText(img, label, (lx, ly), cv2.FONT_HERSHEY_SIMPLEX, 0.4, gfx_bgr, 1)
+
    # Encode result
    _, png_buf = cv2.imencode(".png", img)
    return Response(content=png_buf.tobytes(), media_type="image/png")