feat: OCR pipeline step 8 — validation view with image detection & generation
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s
Replaces the stub StepGroundTruth with a full side-by-side Original vs Reconstruction view. Adds VLM-based image region detection (qwen2.5vl), mflux image generation proxy, sync scroll/zoom, manual region drawing, and score/notes persistence. New backend endpoints: detect-images, generate-image, validate, get validation. New standalone mflux-service (scripts/mflux-service.py) for Metal GPU generation. Dockerfile.base: adds fonts-liberation (Apache-2.0). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2238,6 +2238,271 @@ async def export_reconstruction_docx(session_id: str):
|
||||
raise HTTPException(status_code=501, detail="python-docx not installed")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 8: Validation — Original vs. Reconstruction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Maps each user-selectable style name to the prompt suffix appended when
# generating a replacement image (used by the generate-image endpoint;
# unknown styles fall back to "educational" there).
STYLE_SUFFIXES = {
    "educational": "educational illustration, textbook style, clear, colorful",
    "cartoon": "cartoon, child-friendly, simple shapes",
    "sketch": "pencil sketch, hand-drawn, black and white",
    "clipart": "clipart, flat vector style, simple",
    "realistic": "photorealistic, high detail",
}
|
||||
|
||||
|
||||
class ValidationRequest(BaseModel):
    """Request body for saving step-8 validation results."""

    # Free-text reviewer notes; None when the reviewer left none.
    notes: Optional[str] = None
    # Reviewer quality score; range is not enforced here — presumably set by
    # the frontend's score widget, verify against the caller.
    score: Optional[int] = None
|
||||
|
||||
|
||||
class GenerateImageRequest(BaseModel):
    """Request body for generating a replacement image for one detected region."""

    # Index into the session's validation["image_regions"] list.
    region_index: int
    # Base prompt describing the desired image; the style suffix is appended
    # server-side before generation.
    prompt: str
    # Key into STYLE_SUFFIXES; unknown values fall back to "educational".
    style: str = "educational"
|
||||
|
||||
|
||||
def _parse_vlm_regions(text: str) -> list:
    """Extract and normalize the JSON array of image regions from raw VLM output.

    The VLM is asked to reply with a bare JSON array; this pulls the first
    ``[...]`` span out of the (possibly chatty) response and converts each
    entry into the internal ImageRegion dict format with bbox percentages
    clamped to sane ranges.

    Malformed entries — non-dict array items or non-numeric coordinates, both
    of which VLMs occasionally emit — are skipped instead of failing the whole
    detection pass.
    """
    import re

    match = re.search(r'\[.*?\]', text, re.DOTALL)
    if not match:
        return []
    try:
        raw_regions = json.loads(match.group(0))
    except ValueError:  # includes json.JSONDecodeError
        return []
    if not isinstance(raw_regions, list):
        return []

    regions = []
    for r in raw_regions:
        if not isinstance(r, dict):
            continue  # stray string/number inside the array — drop it
        try:
            bbox = {
                # x/y clamp to [0, 100]; w/h clamp to [1, 100] so a region is
                # never degenerate (zero-sized).
                "x": max(0, min(100, float(r.get("x", 0)))),
                "y": max(0, min(100, float(r.get("y", 0)))),
                "w": max(1, min(100, float(r.get("w", 10)))),
                "h": max(1, min(100, float(r.get("h", 10)))),
            }
        except (TypeError, ValueError):
            continue  # non-numeric coordinate — drop this region only
        description = str(r.get("description", ""))
        regions.append({
            "bbox_pct": bbox,
            "description": description,
            # Generation prompt starts as the bare description; the caller may
            # enrich it with nearby vocabulary context.
            "prompt": description,
            "image_b64": None,
            "style": "educational",
        })
    return regions


def _enrich_region_prompts(regions: list, entries: list) -> None:
    """Append nearby vocabulary words to each region's generation prompt, in place.

    A vocab entry counts as "nearby" when its bbox y-position is within
    (region height + 10) percentage points of the region's top edge.
    """
    for region in regions:
        ry = region["bbox_pct"]["y"]
        rh = region["bbox_pct"]["h"]
        nearby = [
            e for e in entries
            if e.get("bbox") and abs(e["bbox"].get("y", 0) - ry) < rh + 10
        ]
        if not nearby:
            continue
        en_words = [e.get("english", "") for e in nearby if e.get("english")]
        de_words = [e.get("german", "") for e in nearby if e.get("german")]
        if en_words or de_words:
            context = f" (vocabulary context: {', '.join(en_words[:5])}"
            if de_words:
                context += f" / {', '.join(de_words[:5])}"
            context += ")"
            region["prompt"] = region["description"] + context


@router.post("/sessions/{session_id}/reconstruction/detect-images")
async def detect_image_regions(session_id: str):
    """Detect illustration/image regions in the original scan using VLM.

    Sends the original image to qwen2.5vl to find non-text, non-table
    image areas, returning bounding boxes (in %) and descriptions.

    Returns ``{"regions": [...], "count": N}``. VLM/parse failures are
    reported via an ``"error"`` key with an empty region list rather than an
    HTTP error, so the frontend can degrade gracefully.

    Raises:
        HTTPException: 404 if the session is unknown, 400 if it has no
            original image.
    """
    import base64
    import httpx

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Get original image bytes
    original_png = await get_session_image(session_id, "original")
    if not original_png:
        raise HTTPException(status_code=400, detail="No original image found")

    # Build context from vocab entries for richer descriptions
    word_result = session.get("word_result") or {}
    entries = word_result.get("vocab_entries") or word_result.get("entries") or []
    vocab_context = ""
    if entries:
        sample = entries[:10]
        words = [f"{e.get('english', '')} / {e.get('german', '')}" for e in sample if e.get('english')]
        if words:
            vocab_context = f"\nContext: This is a vocabulary page with words like: {', '.join(words)}"

    ollama_base = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
    model = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")

    prompt = (
        "Analyze this scanned page. Find ALL illustration/image/picture regions "
        "(NOT text, NOT table cells, NOT blank areas). "
        "For each image region found, return its bounding box as percentage of page dimensions "
        "and a short English description of what the image shows. "
        "Reply with ONLY a JSON array like: "
        '[{"x": 10, "y": 20, "w": 30, "h": 25, "description": "drawing of a cat"}] '
        "where x, y, w, h are percentages (0-100) of the page width/height. "
        "If there are NO images on the page, return an empty array: []"
        f"{vocab_context}"
    )

    img_b64 = base64.b64encode(original_png).decode("utf-8")
    payload = {
        "model": model,
        "prompt": prompt,
        "images": [img_b64],
        "stream": False,
    }

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(f"{ollama_base}/api/generate", json=payload)
            resp.raise_for_status()
            text = resp.json().get("response", "")

        # Parse + normalize; malformed entries are dropped, not fatal.
        regions = _parse_vlm_regions(text)

        # Enrich prompts with nearby vocab context
        if entries:
            _enrich_region_prompts(regions, entries)

        # Save to ground_truth JSONB
        ground_truth = session.get("ground_truth") or {}
        validation = ground_truth.get("validation") or {}
        validation["image_regions"] = regions
        # NOTE(review): naive UTC timestamp; datetime.utcnow() is deprecated
        # since 3.12 — kept here for format compatibility with stored records.
        validation["detected_at"] = datetime.utcnow().isoformat()
        ground_truth["validation"] = validation
        await update_session_db(session_id, ground_truth=ground_truth)

        # Keep the in-memory session cache coherent with the DB.
        if session_id in _cache:
            _cache[session_id]["ground_truth"] = ground_truth

        logger.info(f"Detected {len(regions)} image regions for session {session_id}")

        return {"regions": regions, "count": len(regions)}

    except httpx.ConnectError:
        # Best-effort: detection is optional, so report instead of raising.
        logger.warning(f"VLM not available at {ollama_base} for image detection")
        return {"regions": [], "count": 0, "error": "VLM not available"}
    except Exception as e:
        logger.error(f"Image detection failed for {session_id}: {e}")
        return {"regions": [], "count": 0, "error": str(e)}
|
||||
|
||||
|
||||
def _pick_generation_size(bbox: dict) -> tuple:
    """Choose (width, height) for generation from a region's aspect ratio.

    Dimensions are fixed multiples of 64: landscape regions (aspect > 1.3)
    get 768x512, portrait (< 0.7) get 512x768, roughly square get 512x512.
    Missing or falsy bbox values fall back to the detection defaults (10),
    yielding a square output instead of raising.
    """
    w = float(bbox.get("w") or 10)
    h = float(bbox.get("h") or 10)
    aspect = w / max(h, 1)
    if aspect > 1.3:
        return 768, 512
    if aspect < 0.7:
        return 512, 768
    return 512, 512


@router.post("/sessions/{session_id}/reconstruction/generate-image")
async def generate_image_for_region(session_id: str, req: GenerateImageRequest):
    """Generate a replacement image for a detected region using mflux.

    Sends the prompt (with style suffix) to the mflux-service running
    natively on the Mac Mini (Metal GPU required).

    Returns ``{"image_b64": ..., "success": bool}``; generation-service
    failures are reported via an ``"error"`` key rather than an HTTP error.

    Raises:
        HTTPException: 404 if the session is unknown, 400 if region_index
            is out of range.
    """
    import httpx

    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    ground_truth = session.get("ground_truth") or {}
    validation = ground_truth.get("validation") or {}
    regions = validation.get("image_regions") or []

    if req.region_index < 0 or req.region_index >= len(regions):
        raise HTTPException(status_code=400, detail=f"Invalid region_index {req.region_index}, have {len(regions)} regions")

    mflux_url = os.getenv("MFLUX_URL", "http://host.docker.internal:8095")
    style_suffix = STYLE_SUFFIXES.get(req.style, STYLE_SUFFIXES["educational"])
    full_prompt = f"{req.prompt}, {style_suffix}"

    # Determine image size from region aspect ratio (snap to multiples of 64).
    # Manually drawn regions may lack bbox keys, so access defensively instead
    # of letting a KeyError surface as an unhandled 500.
    region = regions[req.region_index]
    width, height = _pick_generation_size(region.get("bbox_pct") or {})

    try:
        # Long timeout: diffusion generation on the Metal GPU can take minutes.
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(f"{mflux_url}/generate", json={
                "prompt": full_prompt,
                "width": width,
                "height": height,
                "steps": 4,
            })
            resp.raise_for_status()
            data = resp.json()
            image_b64 = data.get("image_b64")

        if not image_b64:
            return {"image_b64": None, "success": False, "error": "No image returned"}

        # Save to ground_truth
        regions[req.region_index]["image_b64"] = image_b64
        regions[req.region_index]["prompt"] = req.prompt
        regions[req.region_index]["style"] = req.style
        validation["image_regions"] = regions
        ground_truth["validation"] = validation
        await update_session_db(session_id, ground_truth=ground_truth)

        # Keep the in-memory session cache coherent with the DB.
        if session_id in _cache:
            _cache[session_id]["ground_truth"] = ground_truth

        logger.info(f"Generated image for session {session_id} region {req.region_index}")
        return {"image_b64": image_b64, "success": True}

    except httpx.ConnectError:
        logger.warning(f"mflux-service not available at {mflux_url}")
        return {"image_b64": None, "success": False, "error": f"mflux-service not available at {mflux_url}"}
    except Exception as e:
        logger.error(f"Image generation failed for {session_id}: {e}")
        return {"image_b64": None, "success": False, "error": str(e)}
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/reconstruction/validate")
async def save_validation(session_id: str, req: ValidationRequest):
    """Save final validation results for step 8.

    Stores notes, score, and preserves any detected/generated image regions.
    Sets current_step = 8 to mark pipeline as complete.

    Raises:
        HTTPException: 404 if the session is unknown.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Merge the review fields into any existing validation payload so that
    # previously detected/generated image regions survive untouched.
    ground_truth = session.get("ground_truth") or {}
    validation = ground_truth.get("validation") or {}
    validation.update(
        validated_at=datetime.utcnow().isoformat(),
        notes=req.notes,
        score=req.score,
    )
    ground_truth["validation"] = validation

    # Persist and advance the pipeline to its final step.
    await update_session_db(session_id, ground_truth=ground_truth, current_step=8)

    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth

    logger.info(f"Validation saved for session {session_id}: score={req.score}")

    return {"session_id": session_id, "validation": validation}
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}/reconstruction/validation")
async def get_validation(session_id: str):
    """Retrieve saved validation data for step 8."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # validation may legitimately be absent (step 8 not reached yet) — return
    # None in that case rather than an empty dict.
    gt = session.get("ground_truth") or {}
    return {
        "session_id": session_id,
        "validation": gt.get("validation"),
        "word_result": session.get("word_result"),
    }
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/reprocess")
|
||||
async def reprocess_session(session_id: str, request: Request):
|
||||
"""Re-run pipeline from a specific step, clearing downstream data.
|
||||
|
||||
Reference in New Issue
Block a user