feat: OCR pipeline step 8 — validation view with image detection & generation
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s
Replaces the stub StepGroundTruth with a full side-by-side Original vs Reconstruction view. Adds VLM-based image region detection (qwen2.5vl), mflux image generation proxy, sync scroll/zoom, manual region drawing, and score/notes persistence. New backend endpoints: detect-images, generate-image, validate, get validation. New standalone mflux-service (scripts/mflux-service.py) for Metal GPU generation. Dockerfile.base: adds fonts-liberation (Apache-2.0). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
121
scripts/mflux-service.py
Normal file
121
scripts/mflux-service.py
Normal file
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python3
"""
mflux-service — Standalone FastAPI wrapper for mflux image generation.

Runs NATIVELY on Mac Mini (requires Metal GPU, not Docker).
Generates images using Flux Schnell via the mflux library.

Setup:
    python3 -m venv ~/mflux-env
    source ~/mflux-env/bin/activate
    pip install mflux fastapi uvicorn

Run:
    source ~/mflux-env/bin/activate
    python scripts/mflux-service.py

Or as a background service:
    nohup ~/mflux-env/bin/python scripts/mflux-service.py > /tmp/mflux-service.log 2>&1 &

License: Apache-2.0
"""

import base64
import io
import logging
import os
import time
from typing import Optional

import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("mflux-service")

app = FastAPI(title="mflux Image Generation Service", version="1.0.0")

# Lazily-initialized Flux generator (see _get_flux). Kept module-level so the
# ~12 GB model is loaded at most once per process, not on import.
_flux = None
|
||||
|
||||
|
||||
def _get_flux():
    """Return the shared Flux Schnell generator, loading it on first use.

    The heavyweight model load (and possible ~12 GB download) happens only
    on the first call; later calls return the cached module-level instance.
    """
    global _flux
    if _flux is not None:
        return _flux

    logger.info("Loading Flux Schnell model (first call, may download ~12 GB)...")
    # Imported lazily so the service can start (and /health can respond)
    # without mflux being importable yet.
    from mflux import Flux1

    _flux = Flux1(model_name="schnell", quantize=8)
    logger.info("Flux Schnell model loaded.")
    return _flux
|
||||
|
||||
|
||||
class GenerateRequest(BaseModel):
    """Request body for POST /generate."""

    # Text prompt forwarded to Flux Schnell.
    prompt: str
    # Requested dimensions in pixels. The endpoint clamps each to [256, 1024]
    # and snaps down to a multiple of 64 before generation.
    width: int = 512
    height: int = 512
    # Number of inference steps (Schnell is tuned for very few steps).
    steps: int = 4
    # Optional RNG seed; when omitted the endpoint derives one from the clock.
    seed: Optional[int] = None
|
||||
|
||||
|
||||
class GenerateResponse(BaseModel):
    """Response body for POST /generate."""

    # PNG encoded as a "data:image/png;base64,..." data URL; None on failure.
    image_b64: Optional[str] = None
    # True when generation succeeded; False when `error` is populated.
    success: bool = True
    # Human-readable failure message (stringified exception), if any.
    error: Optional[str] = None
    # Wall-clock duration of the request handling, in milliseconds.
    duration_ms: int = 0
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Liveness probe: report service status plus the backing model and GPU."""
    payload = {"status": "ok", "model": "flux-schnell", "gpu": "metal"}
    return payload
|
||||
|
||||
|
||||
@app.post("/generate", response_model=GenerateResponse)
async def generate_image(req: GenerateRequest):
    """Generate an image from a text prompt using Flux Schnell.

    On success returns a GenerateResponse whose ``image_b64`` is a PNG data
    URL; on any failure returns ``success=False`` with the error message
    instead of raising (so callers always get a structured 200 response).
    Requested dimensions are clamped to [256, 1024] and snapped down to the
    nearest multiple of 64, as Flux requires.
    """
    t0 = time.time()

    # Validate dimensions (must be multiples of 64 for Flux).
    width = max(256, min(1024, (req.width // 64) * 64))
    height = max(256, min(1024, (req.height // 64) * 64))

    try:
        from mflux import Config

        # NOTE(review): model loading and generation are synchronous and
        # GPU-bound, so this blocks the event loop for the whole call.
        # Acceptable for a single-user local service; consider offloading to
        # a thread if this ever serves concurrent clients.
        flux = _get_flux()

        # Bug fix: the previous `req.seed or <time-based>` expression treated
        # an explicit seed of 0 as "no seed" (0 is falsy), silently breaking
        # reproducibility for that value. Honor any caller-supplied seed.
        seed = req.seed if req.seed is not None else int(time.time()) % 2**31

        image = flux.generate_image(
            seed=seed,
            prompt=req.prompt,
            config=Config(
                num_inference_steps=req.steps,
                height=height,
                width=width,
            ),
        )

        # Convert PIL image to a base64 PNG data URL.
        buf = io.BytesIO()
        image.save(buf, format="PNG")
        buf.seek(0)
        img_b64 = "data:image/png;base64," + base64.b64encode(buf.read()).decode("utf-8")

        duration_ms = int((time.time() - t0) * 1000)
        logger.info(f"Generated {width}x{height} image in {duration_ms}ms: {req.prompt[:60]}...")

        return GenerateResponse(image_b64=img_b64, success=True, duration_ms=duration_ms)

    except Exception as e:
        # Broad catch is deliberate: this is the HTTP boundary, and any model
        # or import failure should surface as a structured error payload
        # rather than an unhandled 500.
        duration_ms = int((time.time() - t0) * 1000)
        logger.error(f"Generation failed: {e}")
        return GenerateResponse(image_b64=None, success=False, error=str(e), duration_ms=duration_ms)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Listen port is configurable via MFLUX_PORT (default 8095).
    listen_port = int(os.getenv("MFLUX_PORT", "8095"))
    logger.info(f"Starting mflux-service on port {listen_port}")
    uvicorn.run(app, host="0.0.0.0", port=listen_port)
|
||||
Reference in New Issue
Block a user