feat: OCR pipeline step 8 — validation view with image detection & generation

Replaces the stub StepGroundTruth with a full side-by-side Original vs.
Reconstruction view. Adds VLM-based image region detection (qwen2.5vl),
an mflux image-generation proxy, synchronized scroll/zoom, manual region
drawing, and score/notes persistence.
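
A minimal client-side sketch of the resulting flow, using httpx as the backend
itself does. The base URL, session ID, and the assumption that the router is
mounted without an extra prefix are illustrative, not taken from this commit:

    import httpx

    BASE = "http://localhost:8000"          # assumed API address
    session_id = "example-session-id"       # hypothetical session

    with httpx.Client(timeout=300.0) as client:
        # 1. Ask the VLM to propose image regions on the original scan
        detected = client.post(f"{BASE}/sessions/{session_id}/reconstruction/detect-images").json()

        # 2. Generate a replacement image for the first region via the mflux proxy
        if detected["count"]:
            client.post(
                f"{BASE}/sessions/{session_id}/reconstruction/generate-image",
                json={"region_index": 0,
                      "prompt": detected["regions"][0]["prompt"],
                      "style": "cartoon"},
            )

        # 3. Persist score and notes; this marks step 8 (and the pipeline) as complete
        client.post(
            f"{BASE}/sessions/{session_id}/reconstruction/validate",
            json={"score": 8, "notes": "layout matches, one image regenerated"},
        )

        # 4. Read the stored validation back for the UI
        validation = client.get(f"{BASE}/sessions/{session_id}/reconstruction/validation").json()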

New backend endpoints: detect-images, generate-image, validate, get validation.
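
All four endpoints read and write a validation object stored in the session's
ground_truth JSONB. A sketch of its shape, with field names taken from the diff
below and purely illustrative values:

    validation = {
        "detected_at": "2026-03-05T09:41:00",      # written by detect-images
        "image_regions": [
            {
                "bbox_pct": {"x": 12.0, "y": 35.0, "w": 28.0, "h": 20.0},  # % of page size
                "description": "drawing of a cat",
                "prompt": "drawing of a cat (vocabulary context: cat / Katze)",
                "image_b64": None,                 # filled in by generate-image
                "style": "educational",
            },
        ],
        "notes": "reconstruction matches the scan",  # written by validate
        "score": 9,
        "validated_at": "2026-03-05T09:45:00",
    }
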
New standalone mflux-service (scripts/mflux-service.py) for Metal GPU generation.
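
The mflux-service itself is not part of this excerpt; below is only a sketch of
the HTTP contract the proxy in this diff expects it to satisfy. The actual
Flux/Metal generation is stubbed behind a hypothetical run_flux helper:

    import base64
    import io

    from fastapi import FastAPI
    from pydantic import BaseModel

    app = FastAPI()

    class GenerateRequest(BaseModel):
        prompt: str
        width: int = 512
        height: int = 512
        steps: int = 4

    def run_flux(prompt: str, width: int, height: int, steps: int):
        """Hypothetical placeholder for the real mflux call on the Metal GPU."""
        raise NotImplementedError("wire up mflux here")

    @app.post("/generate")
    def generate(req: GenerateRequest):
        image = run_flux(req.prompt, req.width, req.height, req.steps)  # PIL image
        buf = io.BytesIO()
        image.save(buf, format="PNG")
        return {"image_b64": base64.b64encode(buf.getvalue()).decode("utf-8")}

    if __name__ == "__main__":
        import uvicorn
        uvicorn.run(app, host="0.0.0.0", port=8095)  # matches the MFLUX_URL default
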
Dockerfile.base: adds fonts-liberation (OFL-1.1).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: Benjamin Admin
Date:   2026-03-05 10:40:37 +01:00
Parent: 293e7914d8
Commit: 1cc69d6b5e

7 changed files with 1284 additions and 69 deletions


@@ -2238,6 +2238,271 @@ async def export_reconstruction_docx(session_id: str):
        raise HTTPException(status_code=501, detail="python-docx not installed")
# ---------------------------------------------------------------------------
# Step 8: Validation — Original vs. Reconstruction
# ---------------------------------------------------------------------------
STYLE_SUFFIXES = {
    "educational": "educational illustration, textbook style, clear, colorful",
    "cartoon": "cartoon, child-friendly, simple shapes",
    "sketch": "pencil sketch, hand-drawn, black and white",
    "clipart": "clipart, flat vector style, simple",
    "realistic": "photorealistic, high detail",
}

class ValidationRequest(BaseModel):
    notes: Optional[str] = None
    score: Optional[int] = None

class GenerateImageRequest(BaseModel):
    region_index: int
    prompt: str
    style: str = "educational"

@router.post("/sessions/{session_id}/reconstruction/detect-images")
async def detect_image_regions(session_id: str):
"""Detect illustration/image regions in the original scan using VLM.
Sends the original image to qwen2.5vl to find non-text, non-table
image areas, returning bounding boxes (in %) and descriptions.
"""
import base64
import httpx
import re
session = await get_session_db(session_id)
if not session:
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
# Get original image bytes
original_png = await get_session_image(session_id, "original")
if not original_png:
raise HTTPException(status_code=400, detail="No original image found")
# Build context from vocab entries for richer descriptions
word_result = session.get("word_result") or {}
entries = word_result.get("vocab_entries") or word_result.get("entries") or []
vocab_context = ""
if entries:
sample = entries[:10]
words = [f"{e.get('english', '')} / {e.get('german', '')}" for e in sample if e.get('english')]
if words:
vocab_context = f"\nContext: This is a vocabulary page with words like: {', '.join(words)}"
ollama_base = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
model = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
prompt = (
"Analyze this scanned page. Find ALL illustration/image/picture regions "
"(NOT text, NOT table cells, NOT blank areas). "
"For each image region found, return its bounding box as percentage of page dimensions "
"and a short English description of what the image shows. "
"Reply with ONLY a JSON array like: "
'[{"x": 10, "y": 20, "w": 30, "h": 25, "description": "drawing of a cat"}] '
"where x, y, w, h are percentages (0-100) of the page width/height. "
"If there are NO images on the page, return an empty array: []"
f"{vocab_context}"
)
img_b64 = base64.b64encode(original_png).decode("utf-8")
payload = {
"model": model,
"prompt": prompt,
"images": [img_b64],
"stream": False,
}
try:
async with httpx.AsyncClient(timeout=120.0) as client:
resp = await client.post(f"{ollama_base}/api/generate", json=payload)
resp.raise_for_status()
text = resp.json().get("response", "")
# Parse JSON array from response
match = re.search(r'\[.*?\]', text, re.DOTALL)
if match:
raw_regions = json.loads(match.group(0))
else:
raw_regions = []
# Normalize to ImageRegion format
regions = []
for r in raw_regions:
regions.append({
"bbox_pct": {
"x": max(0, min(100, float(r.get("x", 0)))),
"y": max(0, min(100, float(r.get("y", 0)))),
"w": max(1, min(100, float(r.get("w", 10)))),
"h": max(1, min(100, float(r.get("h", 10)))),
},
"description": r.get("description", ""),
"prompt": r.get("description", ""),
"image_b64": None,
"style": "educational",
})
# Enrich prompts with nearby vocab context
if entries:
for region in regions:
ry = region["bbox_pct"]["y"]
rh = region["bbox_pct"]["h"]
nearby = [
e for e in entries
if e.get("bbox") and abs(e["bbox"].get("y", 0) - ry) < rh + 10
]
if nearby:
en_words = [e.get("english", "") for e in nearby if e.get("english")]
de_words = [e.get("german", "") for e in nearby if e.get("german")]
if en_words or de_words:
context = f" (vocabulary context: {', '.join(en_words[:5])}"
if de_words:
context += f" / {', '.join(de_words[:5])}"
context += ")"
region["prompt"] = region["description"] + context
# Save to ground_truth JSONB
ground_truth = session.get("ground_truth") or {}
validation = ground_truth.get("validation") or {}
validation["image_regions"] = regions
validation["detected_at"] = datetime.utcnow().isoformat()
ground_truth["validation"] = validation
await update_session_db(session_id, ground_truth=ground_truth)
if session_id in _cache:
_cache[session_id]["ground_truth"] = ground_truth
logger.info(f"Detected {len(regions)} image regions for session {session_id}")
return {"regions": regions, "count": len(regions)}
except httpx.ConnectError:
logger.warning(f"VLM not available at {ollama_base} for image detection")
return {"regions": [], "count": 0, "error": "VLM not available"}
except Exception as e:
logger.error(f"Image detection failed for {session_id}: {e}")
return {"regions": [], "count": 0, "error": str(e)}
@router.post("/sessions/{session_id}/reconstruction/generate-image")
async def generate_image_for_region(session_id: str, req: GenerateImageRequest):
"""Generate a replacement image for a detected region using mflux.
Sends the prompt (with style suffix) to the mflux-service running
natively on the Mac Mini (Metal GPU required).
"""
import httpx
session = await get_session_db(session_id)
if not session:
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
ground_truth = session.get("ground_truth") or {}
validation = ground_truth.get("validation") or {}
regions = validation.get("image_regions") or []
if req.region_index < 0 or req.region_index >= len(regions):
raise HTTPException(status_code=400, detail=f"Invalid region_index {req.region_index}, have {len(regions)} regions")
mflux_url = os.getenv("MFLUX_URL", "http://host.docker.internal:8095")
style_suffix = STYLE_SUFFIXES.get(req.style, STYLE_SUFFIXES["educational"])
full_prompt = f"{req.prompt}, {style_suffix}"
# Determine image size from region aspect ratio (snap to multiples of 64)
region = regions[req.region_index]
bbox = region["bbox_pct"]
aspect = bbox["w"] / max(bbox["h"], 1)
if aspect > 1.3:
width, height = 768, 512
elif aspect < 0.7:
width, height = 512, 768
else:
width, height = 512, 512
try:
async with httpx.AsyncClient(timeout=300.0) as client:
resp = await client.post(f"{mflux_url}/generate", json={
"prompt": full_prompt,
"width": width,
"height": height,
"steps": 4,
})
resp.raise_for_status()
data = resp.json()
image_b64 = data.get("image_b64")
if not image_b64:
return {"image_b64": None, "success": False, "error": "No image returned"}
# Save to ground_truth
regions[req.region_index]["image_b64"] = image_b64
regions[req.region_index]["prompt"] = req.prompt
regions[req.region_index]["style"] = req.style
validation["image_regions"] = regions
ground_truth["validation"] = validation
await update_session_db(session_id, ground_truth=ground_truth)
if session_id in _cache:
_cache[session_id]["ground_truth"] = ground_truth
logger.info(f"Generated image for session {session_id} region {req.region_index}")
return {"image_b64": image_b64, "success": True}
except httpx.ConnectError:
logger.warning(f"mflux-service not available at {mflux_url}")
return {"image_b64": None, "success": False, "error": f"mflux-service not available at {mflux_url}"}
except Exception as e:
logger.error(f"Image generation failed for {session_id}: {e}")
return {"image_b64": None, "success": False, "error": str(e)}
@router.post("/sessions/{session_id}/reconstruction/validate")
async def save_validation(session_id: str, req: ValidationRequest):
"""Save final validation results for step 8.
Stores notes, score, and preserves any detected/generated image regions.
Sets current_step = 8 to mark pipeline as complete.
"""
session = await get_session_db(session_id)
if not session:
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
ground_truth = session.get("ground_truth") or {}
validation = ground_truth.get("validation") or {}
validation["validated_at"] = datetime.utcnow().isoformat()
validation["notes"] = req.notes
validation["score"] = req.score
ground_truth["validation"] = validation
await update_session_db(session_id, ground_truth=ground_truth, current_step=8)
if session_id in _cache:
_cache[session_id]["ground_truth"] = ground_truth
logger.info(f"Validation saved for session {session_id}: score={req.score}")
return {"session_id": session_id, "validation": validation}
@router.get("/sessions/{session_id}/reconstruction/validation")
async def get_validation(session_id: str):
"""Retrieve saved validation data for step 8."""
session = await get_session_db(session_id)
if not session:
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
ground_truth = session.get("ground_truth") or {}
validation = ground_truth.get("validation")
return {
"session_id": session_id,
"validation": validation,
"word_result": session.get("word_result"),
}
@router.post("/sessions/{session_id}/reprocess")
async def reprocess_session(session_id: str, request: Request):
"""Re-run pipeline from a specific step, clearing downstream data.