Python (6 files in klausur-service): - rbac.py (1,132 → 4), admin_api.py (1,012 → 4) - routes/eh.py (1,111 → 4), ocr_pipeline_geometry.py (1,105 → 5) Python (2 files in backend-lehrer): - unit_api.py (1,226 → 6), game_api.py (1,129 → 5) Website (6 page files): - 4x klausur-korrektur pages (1,249-1,328 LOC each) → shared components in website/components/klausur-korrektur/ (17 shared files) - companion (1,057 → 10), magic-help (1,017 → 8) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
300 lines
10 KiB
Python
300 lines
10 KiB
Python
"""
|
|
OCR Pipeline Structure Detection and Exclude Regions
|
|
|
|
Detect document structure (boxes, zones, color regions, graphics)
|
|
and manage user-drawn exclude regions.
|
|
Extracted from ocr_pipeline_geometry.py for file-size compliance.
|
|
"""
|
|
|
|
import logging
|
|
import time
|
|
from typing import Any, Dict, List
|
|
|
|
import cv2
|
|
import numpy as np
|
|
from fastapi import APIRouter, HTTPException
|
|
from pydantic import BaseModel
|
|
|
|
from cv_box_detect import detect_boxes
|
|
from cv_color_detect import _COLOR_RANGES, _COLOR_HEX
|
|
from cv_graphic_detect import detect_graphic_elements
|
|
from ocr_pipeline_session_store import (
|
|
get_session_db,
|
|
update_session_db,
|
|
)
|
|
from ocr_pipeline_common import (
|
|
_cache,
|
|
_load_session_to_cache,
|
|
_get_cached,
|
|
_filter_border_ghost_words,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Structure Detection Endpoint
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _collect_words(word_result) -> List[Dict]:
    """Flatten OCR word boxes out of the cell results.

    Falls back to the raw word lists (paddle-split first, then tesseract,
    then plain paddle) when the cells carry no word boxes yet. Returns an
    empty list when no word result is available at all.
    """
    words: List[Dict] = []
    if word_result and word_result.get("cells"):
        for cell in word_result["cells"]:
            words.extend(cell.get("word_boxes") or [])
    # Fallback: use raw OCR words if cell word_boxes are empty
    if not words and word_result:
        for key in ("raw_paddle_words_split", "raw_tesseract_words", "raw_paddle_words"):
            raw = word_result.get(key, [])
            if raw:
                words = raw
                logger.info("detect-structure: using %d words from %s (no cell word_boxes)", len(words), key)
                break
    return words


def _content_bounds(words: List[Dict], w: int, h: int):
    """Return (x, y, width, height) of the content area.

    Uses the union of all word boxes when words exist; otherwise falls back
    to the full image minus a 3% margin on every side.
    """
    if words:
        x = max(0, min(int(wb["left"]) for wb in words))
        y = max(0, min(int(wb["top"]) for wb in words))
        right = min(w, max(int(wb["left"] + wb["width"]) for wb in words))
        bottom = min(h, max(int(wb["top"] + wb["height"]) for wb in words))
        return x, y, right - x, bottom - y
    margin = int(min(w, h) * 0.03)
    return margin, margin, w - 2 * margin, h - 2 * margin


def _sample_box_colors(hsv: np.ndarray, boxes, w: int, h: int) -> List[Dict[str, str]]:
    """Sample the median background color of each detected box.

    Only the central quarter of each box is sampled so borders and text at
    the edges do not skew the median.
    """
    # Hoisted out of the per-box loop (was re-executed on every iteration).
    from cv_color_detect import _hue_to_color_name

    colors: List[Dict[str, str]] = []
    for box in boxes:
        # Center region of the box, clamped to valid image coordinates.
        cy1 = max(0, min(box.y + box.height // 4, h - 1))
        cy2 = max(0, min(box.y + 3 * box.height // 4, h - 1))
        cx1 = max(0, min(box.x + box.width // 4, w - 1))
        cx2 = max(0, min(box.x + 3 * box.width // 4, w - 1))
        if cy2 > cy1 and cx2 > cx1:
            roi_hsv = hsv[cy1:cy2, cx1:cx2]
            med_h = float(np.median(roi_hsv[:, :, 0]))
            med_s = float(np.median(roi_hsv[:, :, 1]))
            med_v = float(np.median(roi_hsv[:, :, 2]))
            if med_s > 15:  # saturated enough to call it a real color
                bg_name = _hue_to_color_name(med_h)
                bg_hex = _COLOR_HEX.get(bg_name, "#6b7280")
            else:
                bg_name = "gray" if med_v < 220 else "white"
                bg_hex = "#6b7280" if bg_name == "gray" else "#ffffff"
        else:
            # Degenerate box (clamped away to nothing) -- color unknown.
            bg_name = "unknown"
            bg_hex = "#6b7280"
        colors.append({"color_name": bg_name, "color_hex": bg_hex})
    return colors


def _color_pixel_summary(hsv: np.ndarray, h: int, w: int) -> Dict[str, int]:
    """Quick page-wide scan: count pixels matching each known color range."""
    summary: Dict[str, int] = {}
    for color_name, ranges in _COLOR_RANGES.items():
        mask = np.zeros((h, w), dtype=np.uint8)
        for lower, upper in ranges:
            mask = cv2.bitwise_or(mask, cv2.inRange(hsv, lower, upper))
        pixel_count = int(np.sum(mask > 0))
        if pixel_count > 50:  # minimum threshold to ignore noise
            summary[color_name] = pixel_count
    return summary


@router.post("/sessions/{session_id}/detect-structure")
async def detect_structure(session_id: str):
    """Detect document structure: boxes, zones, graphics, and color regions.

    Runs box detection (line + shading), zone splitting, per-box background
    color sampling, a page-wide color scan, and graphic-element detection on
    the cropped (or dewarped) image. Border-ghost words are filtered out of
    the cached OCR word result, user-drawn exclude regions from a previous
    run are preserved, and the aggregated result is persisted to the session.

    Raises:
        HTTPException: 400 when neither crop nor dewarp has completed yet.
    """
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    # Prefer the cropped image; fall back to the dewarped one.
    img_bgr = (
        cached.get("cropped_bgr")
        if cached.get("cropped_bgr") is not None
        else cached.get("dewarped_bgr")
    )
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed first")

    t0 = time.time()
    h, w = img_bgr.shape[:2]

    # --- Content bounds from word result (if available) or full image ---
    word_result = cached.get("word_result")
    words = _collect_words(word_result)
    content_x, content_y, content_w_px, content_h_px = _content_bounds(words, w, h)

    # --- Box detection ---
    boxes = detect_boxes(
        img_bgr,
        content_x=content_x,
        content_w=content_w_px,
        content_y=content_y,
        content_h=content_h_px,
    )

    # --- Zone splitting ---
    from cv_box_detect import split_page_into_zones as _split_zones
    zones = _split_zones(content_x, content_y, content_w_px, content_h_px, boxes)

    # --- Color analysis: per-box background shading + page-wide summary ---
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    box_colors = _sample_box_colors(hsv, boxes, w, h)
    color_summary = _color_pixel_summary(hsv, h, w)

    # --- Graphic element detection ---
    box_dicts = [{"x": b.x, "y": b.y, "w": b.width, "h": b.height} for b in boxes]
    graphics = detect_graphic_elements(
        img_bgr, words,
        detected_boxes=box_dicts,
    )

    # --- Filter border-ghost words from OCR result ---
    ghost_count = 0
    if boxes and word_result:
        ghost_count = _filter_border_ghost_words(word_result, boxes)
        if ghost_count:
            logger.info("detect-structure: removed %d border-ghost words", ghost_count)
            await update_session_db(session_id, word_result=word_result)
            cached["word_result"] = word_result

    duration = time.time() - t0

    # Preserve user-drawn exclude regions from previous run
    prev_sr = cached.get("structure_result") or {}
    prev_exclude = prev_sr.get("exclude_regions", [])

    result_dict = {
        "image_width": w,
        "image_height": h,
        "content_bounds": {
            "x": content_x, "y": content_y,
            "w": content_w_px, "h": content_h_px,
        },
        "boxes": [
            {
                "x": b.x, "y": b.y, "w": b.width, "h": b.height,
                "confidence": b.confidence,
                "border_thickness": b.border_thickness,
                "bg_color_name": box_colors[i]["color_name"],
                "bg_color_hex": box_colors[i]["color_hex"],
            }
            for i, b in enumerate(boxes)
        ],
        "zones": [
            {
                "index": z.index,
                "zone_type": z.zone_type,
                "y": z.y, "h": z.height,
                "x": z.x, "w": z.width,
            }
            for z in zones
        ],
        "graphics": [
            {
                "x": g.x, "y": g.y, "w": g.width, "h": g.height,
                "area": g.area,
                "shape": g.shape,
                "color_name": g.color_name,
                "color_hex": g.color_hex,
                "confidence": round(g.confidence, 2),
            }
            for g in graphics
        ],
        "exclude_regions": prev_exclude,
        "color_pixel_counts": color_summary,
        "has_words": len(words) > 0,
        "word_count": len(words),
        "border_ghosts_removed": ghost_count,
        "duration_seconds": round(duration, 2),
    }

    # Persist to session
    await update_session_db(session_id, structure_result=result_dict)
    cached["structure_result"] = result_dict

    logger.info("detect-structure session %s: %d boxes, %d zones, %d graphics, %.2fs",
                session_id, len(boxes), len(zones), len(graphics), duration)

    return {"session_id": session_id, **result_dict}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Exclude Regions -- user-drawn rectangles to exclude from OCR results
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class _ExcludeRegionIn(BaseModel):
    """Request schema for a single user-drawn exclude rectangle.

    Coordinates are in pixels of the processed (cropped/dewarped) image,
    matching the coordinate space used by ``structure_result``.
    """

    # Top-left corner of the rectangle (pixels).
    x: int
    y: int
    # Width and height of the rectangle (pixels).
    w: int
    h: int
    # Optional user-facing label; empty string when the region is unnamed.
    label: str = ""
|
|
|
|
|
|
class _ExcludeRegionsBatchIn(BaseModel):
    """Request schema for replacing all exclude regions of a session at once."""

    # Full replacement set; an empty list clears all exclude regions.
    regions: list[_ExcludeRegionIn]
|
|
|
|
|
|
@router.put("/sessions/{session_id}/exclude-regions")
async def set_exclude_regions(session_id: str, body: _ExcludeRegionsBatchIn):
    """Replace the full set of exclude regions for a session.

    The regions are stored under ``structure_result.exclude_regions``; the
    grid result is invalidated so it is rebuilt honoring the new regions.

    Raises:
        HTTPException: 404 when the session does not exist.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    structure = session.get("structure_result") or {}
    new_regions = [region.model_dump() for region in body.regions]
    structure["exclude_regions"] = new_regions

    # Drop the stale grid so the next build picks up the new exclude regions.
    await update_session_db(session_id, structure_result=structure, grid_editor_result=None)

    # Keep the in-memory cache consistent with the database.
    if session_id in _cache:
        entry = _cache[session_id]
        entry["structure_result"] = structure
        entry.pop("grid_editor_result", None)

    return {
        "session_id": session_id,
        "exclude_regions": new_regions,
        "count": len(new_regions),
    }
|
|
|
|
|
|
@router.delete("/sessions/{session_id}/exclude-regions/{region_index}")
async def delete_exclude_region(session_id: str, region_index: int):
    """Delete a single exclude region, addressed by its list index.

    Raises:
        HTTPException: 404 when the session does not exist or the index is
            outside the current region list.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    structure = session.get("structure_result") or {}
    regions = structure.get("exclude_regions", [])

    if not 0 <= region_index < len(regions):
        raise HTTPException(status_code=404, detail="Region index out of range")

    removed = regions.pop(region_index)
    structure["exclude_regions"] = regions

    # The grid must be regenerated now that the region set changed.
    await update_session_db(session_id, structure_result=structure, grid_editor_result=None)

    # Mirror the change into the in-memory cache if the session is loaded.
    if session_id in _cache:
        entry = _cache[session_id]
        entry["structure_result"] = structure
        entry.pop("grid_editor_result", None)

    return {
        "session_id": session_id,
        "removed": removed,
        "remaining": len(regions),
    }
|