Add exclude region marking in Structure step
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m47s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m47s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 16s
Users can now draw rectangles on the document image in the Structure Detection step to mark areas (e.g. header graphics, alphabet strips) that should be excluded from OCR results during grid building.

- Backend: PUT/DELETE endpoints for exclude regions stored in structure_result
- Backend: _build_grid_core() filters all words inside user-defined exclude regions
- Frontend: Interactive rectangle drawing with visual overlay and delete buttons
- Preserve exclude regions when re-running structure detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -792,13 +792,44 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
# page number ("64", "S. 12") and not real content.
|
||||
_filter_footer_words(all_words, img_h, logger, session_id)
|
||||
|
||||
# 2d. Filter words inside detected graphic/image regions
|
||||
# 2d. Filter words inside user-defined exclude regions (from Structure step).
|
||||
# These are explicitly marked by the user, so ALL words inside are removed
|
||||
# regardless of confidence.
|
||||
structure_result = session.get("structure_result")
|
||||
exclude_rects = []
|
||||
if structure_result:
|
||||
for er in structure_result.get("exclude_regions", []):
|
||||
exclude_rects.append({
|
||||
"x": er["x"], "y": er["y"],
|
||||
"w": er["w"], "h": er["h"],
|
||||
})
|
||||
if exclude_rects:
|
||||
before = len(all_words)
|
||||
filtered = []
|
||||
for w in all_words:
|
||||
w_cx = w["left"] + w.get("width", 0) / 2
|
||||
w_cy = w["top"] + w.get("height", 0) / 2
|
||||
inside = any(
|
||||
er["x"] <= w_cx <= er["x"] + er["w"]
|
||||
and er["y"] <= w_cy <= er["y"] + er["h"]
|
||||
for er in exclude_rects
|
||||
)
|
||||
if not inside:
|
||||
filtered.append(w)
|
||||
removed = before - len(filtered)
|
||||
if removed:
|
||||
all_words = filtered
|
||||
logger.info(
|
||||
"build-grid session %s: removed %d words inside %d user exclude region(s)",
|
||||
session_id, removed, len(exclude_rects),
|
||||
)
|
||||
|
||||
# 2e. Filter words inside detected graphic/image regions
|
||||
# Only remove LOW-CONFIDENCE words (likely OCR artifacts from images).
|
||||
# High-confidence words are real text even if they overlap a detected
|
||||
# graphic region (e.g. colored text that graphic detection couldn't
|
||||
# fully distinguish from an image).
|
||||
_GRAPHIC_CONF_THRESHOLD = 50 # keep words with conf >= 50
|
||||
structure_result = session.get("structure_result")
|
||||
graphic_rects = []
|
||||
if structure_result:
|
||||
for g in structure_result.get("graphics", []):
|
||||
|
||||
@@ -22,6 +22,7 @@ from typing import Any, Dict, List, Optional
|
||||
import cv2
|
||||
import numpy as np
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
from pydantic import BaseModel
|
||||
|
||||
from cv_vocab_pipeline import (
|
||||
_apply_shear,
|
||||
@@ -712,6 +713,10 @@ async def detect_structure(session_id: str):
|
||||
|
||||
duration = time.time() - t0
|
||||
|
||||
# Preserve user-drawn exclude regions from previous run
|
||||
prev_sr = cached.get("structure_result") or {}
|
||||
prev_exclude = prev_sr.get("exclude_regions", [])
|
||||
|
||||
result_dict = {
|
||||
"image_width": w,
|
||||
"image_height": h,
|
||||
@@ -749,6 +754,7 @@ async def detect_structure(session_id: str):
|
||||
}
|
||||
for g in graphics
|
||||
],
|
||||
"exclude_regions": prev_exclude,
|
||||
"color_pixel_counts": color_summary,
|
||||
"has_words": len(words) > 0,
|
||||
"word_count": len(words),
|
||||
@@ -766,6 +772,76 @@ async def detect_structure(session_id: str):
|
||||
return {"session_id": session_id, **result_dict}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Exclude Regions — user-drawn rectangles to exclude from OCR results
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _ExcludeRegionIn(BaseModel):
    """Request payload describing one user-drawn exclude rectangle.

    The grid builder drops a word when its centre lies within
    ``[x, x + w]`` horizontally and ``[y, y + h]`` vertically, so the
    values share the coordinate space of the OCR word boxes
    (presumably image pixels — confirm against the frontend).
    """

    # Left edge of the rectangle.
    x: int
    # Top edge of the rectangle.
    y: int
    # Horizontal extent, measured from ``x``.
    w: int
    # Vertical extent, measured from ``y``.
    h: int
    # Optional free-text label; empty when the client sends none.
    label: str = ""
|
||||
|
||||
|
||||
class _ExcludeRegionsBatchIn(BaseModel):
    """Request body carrying the complete list of exclude regions.

    The PUT endpoint stores this list as the session's new set of
    regions, replacing whatever was stored before.
    """

    # Every region that should exist after the request; may be empty.
    regions: list[_ExcludeRegionIn]
|
||||
|
||||
|
||||
@router.put("/sessions/{session_id}/exclude-regions")
async def set_exclude_regions(session_id: str, body: _ExcludeRegionsBatchIn):
    """Overwrite the exclude regions of a session with the submitted batch.

    The regions are written to the ``exclude_regions`` key of the
    session's ``structure_result``. When the session has no (or an empty)
    ``structure_result`` yet, a new dict holding only that key is stored.

    Args:
        session_id: Identifier of the session to update.
        body: Batch containing the full list of regions to store.

    Returns:
        A dict with the ``session_id``, the stored ``exclude_regions`` and
        their ``count``.

    Raises:
        HTTPException: 404 if no session exists for ``session_id``.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    structure = session.get("structure_result") or {}
    stored_regions = [region.model_dump() for region in body.regions]
    structure["exclude_regions"] = stored_regions

    await update_session_db(session_id, structure_result=structure)

    # Mirror the persisted value into the in-memory cache when the session
    # is cached, so later reads see the new regions.
    if session_id in _cache:
        _cache[session_id]["structure_result"] = structure

    return {
        "session_id": session_id,
        "exclude_regions": stored_regions,
        "count": len(stored_regions),
    }
|
||||
|
||||
|
||||
@router.delete("/sessions/{session_id}/exclude-regions/{region_index}")
async def delete_exclude_region(session_id: str, region_index: int):
    """Remove a single exclude region by index.

    Args:
        session_id: Identifier of the session whose regions are edited.
        region_index: Zero-based position of the region inside
            ``structure_result["exclude_regions"]``.

    Returns:
        A dict with the ``session_id``, the ``removed`` region and the
        number of ``remaining`` regions.

    Raises:
        HTTPException: 404 if the session does not exist, or if
            ``region_index`` is outside the stored list (including the
            case where no regions are stored at all).
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    sr = session.get("structure_result") or {}
    # ``dict.get`` only uses its default when the key is absent; a stored
    # null would otherwise reach ``len()`` and surface as a 500 instead of
    # the intended 404.
    regions = sr.get("exclude_regions") or []

    if not 0 <= region_index < len(regions):
        raise HTTPException(status_code=404, detail="Region index out of range")

    removed = regions.pop(region_index)
    sr["exclude_regions"] = regions

    await update_session_db(session_id, structure_result=sr)

    # Keep the in-memory cache consistent with the persisted state.
    if session_id in _cache:
        _cache[session_id]["structure_result"] = sr

    return {
        "session_id": session_id,
        "removed": removed,
        "remaining": len(regions),
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Column Detection Endpoints (Step 3)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user