diff --git a/admin-v2/components/ocr/GroundTruthPanel.tsx b/admin-v2/components/ocr/GroundTruthPanel.tsx index e531c3d..1bf8068 100644 --- a/admin-v2/components/ocr/GroundTruthPanel.tsx +++ b/admin-v2/components/ocr/GroundTruthPanel.tsx @@ -111,6 +111,9 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou const [imageNatural, setImageNatural] = useState({ w: 0, h: 0 }) const [showSummary, setShowSummary] = useState(false) const [savedMessage, setSavedMessage] = useState(null) + const [isFullscreen, setIsFullscreen] = useState(false) + const [imageUrl, setImageUrl] = useState(pageImageUrl) + const [deskewAngle, setDeskewAngle] = useState(null) // Editable fields for current entry const [editEn, setEditEn] = useState('') @@ -120,13 +123,19 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou const panelRef = useRef(null) const enInputRef = useRef(null) + // Reset image URL when page changes + useEffect(() => { + setImageUrl(pageImageUrl) + setDeskewAngle(null) + }, [pageImageUrl]) + // Load natural image dimensions useEffect(() => { - if (!pageImageUrl) return + if (!imageUrl) return const img = new Image() img.onload = () => setImageNatural({ w: img.naturalWidth, h: img.naturalHeight }) - img.src = pageImageUrl - }, [pageImageUrl]) + img.src = imageUrl + }, [imageUrl]) // Sync edit fields when current entry changes useEffect(() => { @@ -157,6 +166,12 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou const loaded: GTEntry[] = (data.entries || []).map((e: GTEntry) => ({ ...e, status: 'pending' as const })) setEntries(loaded) setCurrentIndex(0) + + // Switch to deskewed image if available + if (data.deskewed) { + setImageUrl(`${KLAUSUR_API}/api/v1/vocab/sessions/${sessionId}/deskewed-image/${selectedPage}`) + setDeskewAngle(data.deskew_angle) + } } catch (err) { setError(err instanceof Error ? err.message : 'Extraction failed') } finally { @@ -225,9 +240,15 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou // ---------- Keyboard shortcuts ---------- useEffect(() => { - if (entries.length === 0 || showSummary) return - const handler = (e: KeyboardEvent) => { + if (e.key === 'Escape' && isFullscreen) { + e.preventDefault() + setIsFullscreen(false) + return + } + + if (entries.length === 0 || showSummary) return + // Don't capture when typing in inputs const tag = (e.target as HTMLElement)?.tagName const isInput = tag === 'INPUT' || tag === 'TEXTAREA' @@ -251,7 +272,7 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou window.addEventListener('keydown', handler) return () => window.removeEventListener('keydown', handler) - }, [entries.length, showSummary, confirmEntry, skipEntry, goTo, currentIndex]) + }, [entries.length, showSummary, isFullscreen, confirmEntry, skipEntry, goTo, currentIndex]) // ---------- Computed ---------- @@ -298,8 +319,27 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou if (showSummary) { return ( -
-

Zusammenfassung

+
+
+

Zusammenfassung

+ +
{confirmedCount}
@@ -385,22 +425,47 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou // ---------- Render: Main Review UI ---------- return ( -
- {/* Progress bar */} -
-
+
+ {/* Header with progress + fullscreen toggle */} +
+
+
+
+ {currentIndex + 1}/{entries.length} + {deskewAngle !== null && ( + + {deskewAngle.toFixed(1)}° + + )} +
-
+
{/* Left: Page image with SVG overlay (2/3) */} -
+
- {pageImageUrl && ( + {imageUrl && ( {`Seite {/* Right: Crops + Edit fields (1/3) */} -
+
{currentEntry && ( <> {/* Row crop */} {imageNatural.w > 0 && ( {currentEntry.bbox_en.w > 0 && ( 0 && ( 0 && ( - Enter = Bestaetigen · Tab = Ueberspringen · ←→ = Navigieren + Enter = Bestaetigen · Tab = Ueberspringen · ←→ = Navigieren{isFullscreen ? ' \u00B7 Esc = Vollbild verlassen' : ''}
)} diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 8ef1304..75af8b4 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -193,6 +193,127 @@ def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]: return corrected, median_angle +def deskew_image_by_word_alignment( + image_data: bytes, + lang: str = "eng+deu", + downscale_factor: float = 0.5, +) -> Tuple[bytes, float]: + """Correct rotation by fitting a line through left-most word starts per text line. + + More robust than Hough-based deskew for vocabulary worksheets where text lines + have consistent left-alignment. Runs a quick Tesseract pass on a downscaled + copy to find word positions, computes the dominant left-edge column, fits a + line through those points and rotates the full-resolution image. + + Args: + image_data: Raw image bytes (PNG/JPEG). + lang: Tesseract language string for the quick pass. + downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%). + + Returns: + Tuple of (rotated image as PNG bytes, detected angle in degrees). + """ + if not CV2_AVAILABLE or not TESSERACT_AVAILABLE: + return image_data, 0.0 + + # 1. Decode image + img_array = np.frombuffer(image_data, dtype=np.uint8) + img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) + if img is None: + logger.warning("deskew_by_word_alignment: could not decode image") + return image_data, 0.0 + + orig_h, orig_w = img.shape[:2] + + # 2. Downscale for fast Tesseract pass + small_w = int(orig_w * downscale_factor) + small_h = int(orig_h * downscale_factor) + small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA) + + # 3. Quick Tesseract — word-level positions + pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB)) + try: + data = pytesseract.image_to_data( + pil_small, lang=lang, config="--psm 6 --oem 3", + output_type=pytesseract.Output.DICT, + ) + except Exception as e: + logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}") + return image_data, 0.0 + + # 4. Per text-line, find the left-most word start + # Group by (block_num, par_num, line_num) + from collections import defaultdict + line_groups: Dict[tuple, list] = defaultdict(list) + for i in range(len(data["text"])): + text = (data["text"][i] or "").strip() + conf = int(data["conf"][i]) + if not text or conf < 20: + continue + key = (data["block_num"][i], data["par_num"][i], data["line_num"][i]) + line_groups[key].append(i) + + if len(line_groups) < 5: + logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping") + return image_data, 0.0 + + # For each line, pick the word with smallest 'left' → compute (left_x, center_y) + # Scale back to original resolution + scale = 1.0 / downscale_factor + points = [] # list of (x, y) in original-image coords + for key, indices in line_groups.items(): + best_idx = min(indices, key=lambda i: data["left"][i]) + lx = data["left"][best_idx] * scale + top = data["top"][best_idx] * scale + h = data["height"][best_idx] * scale + cy = top + h / 2.0 + points.append((lx, cy)) + + # 5. Find dominant left-edge column + compute angle + xs = np.array([p[0] for p in points]) + ys = np.array([p[1] for p in points]) + median_x = float(np.median(xs)) + tolerance = orig_w * 0.03 # 3% of image width + + mask = np.abs(xs - median_x) <= tolerance + filtered_xs = xs[mask] + filtered_ys = ys[mask] + + if len(filtered_xs) < 5: + logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping") + return image_data, 0.0 + + # polyfit: x = a*y + b → a = dx/dy → angle = arctan(a) + coeffs = np.polyfit(filtered_ys, filtered_xs, 1) + slope = coeffs[0] # dx/dy + angle_rad = np.arctan(slope) + angle_deg = float(np.degrees(angle_rad)) + + # Clamp to ±5° + angle_deg = max(-5.0, min(5.0, angle_deg)) + + logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}° from {len(filtered_xs)} points " + f"(total lines: {len(line_groups)})") + + if abs(angle_deg) < 0.05: + return image_data, 0.0 + + # 6. Rotate full-res image + center = (orig_w // 2, orig_h // 2) + M = cv2.getRotationMatrix2D(center, angle_deg, 1.0) + rotated = cv2.warpAffine(img, M, (orig_w, orig_h), + flags=cv2.INTER_LINEAR, + borderMode=cv2.BORDER_REPLICATE) + + # Encode back to PNG + success, png_buf = cv2.imencode(".png", rotated) + if not success: + logger.warning("deskew_by_word_alignment: PNG encoding failed") + return image_data, 0.0 + + return png_buf.tobytes(), angle_deg + + # ============================================================================= # Stage 3: Dewarp (Book Curvature) — Pass-Through for now # ============================================================================= diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py index 495c6b9..29b4c69 100644 --- a/klausur-service/backend/vocab_worksheet_api.py +++ b/klausur-service/backend/vocab_worksheet_api.py @@ -2134,7 +2134,22 @@ async def extract_with_boxes(session_id: str, page_number: int): # Convert page to hires image image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) - # Extract entries with boxes + # Deskew image before OCR + deskew_angle = 0.0 + try: + from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE + if CV2_AVAILABLE: + image_data, deskew_angle = deskew_image_by_word_alignment(image_data) + logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}") + except Exception as e: + logger.warning(f"Deskew failed for page {page_number}: {e}") + + # Cache deskewed image in session for later serving + if "deskewed_images" not in session: + session["deskewed_images"] = {} + session["deskewed_images"][str(page_number)] = image_data + + # Extract entries with boxes (now on deskewed image) result = await extract_entries_with_boxes(image_data) # Cache in session @@ -2148,9 +2163,35 @@ async def extract_with_boxes(session_id: str, page_number: int): "entry_count": len(result["entries"]), "image_width": result["image_width"], "image_height": result["image_height"], + "deskew_angle": round(deskew_angle, 2), + "deskewed": abs(deskew_angle) > 0.05, } +@router.get("/sessions/{session_id}/deskewed-image/{page_number}") +async def get_deskewed_image(session_id: str, page_number: int): + """Return the deskewed page image as PNG. + + Falls back to the original hires image if no deskewed version is cached. + """ + if session_id not in _sessions: + raise HTTPException(status_code=404, detail="Session not found") + + session = _sessions[session_id] + deskewed = session.get("deskewed_images", {}).get(str(page_number)) + + if deskewed: + return StreamingResponse(io.BytesIO(deskewed), media_type="image/png") + + # Fallback: render original hires image + pdf_data = session.get("pdf_data") + if not pdf_data: + raise HTTPException(status_code=400, detail="No PDF uploaded for this session") + + image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) + return StreamingResponse(io.BytesIO(image_data), media_type="image/png") + + @router.post("/sessions/{session_id}/ground-truth/{page_number}") async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)): """Save ground truth labels for a page.