From 293e7914d8e1e2b8765fd5ee53c89f35f73e1625 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 5 Mar 2026 09:44:38 +0100 Subject: [PATCH] feat: improved OCR pipeline session manager with categories, thumbnails, pipeline logging - Add document_category (10 types) and pipeline_log JSONB columns - Session list: thumbnails, copyable IDs, category/doc_type badges - Inline category dropdown, bulk delete, pipeline step logging - New endpoints: thumbnail, delete-all, pipeline-log, categories - Cleared all 22 old test sessions Co-Authored-By: Claude Opus 4.6 --- .../app/(admin)/ai/ocr-pipeline/page.tsx | 270 +++++++++++++----- .../app/(admin)/ai/ocr-pipeline/types.ts | 33 +++ klausur-service/backend/ocr_pipeline_api.py | 160 ++++++++++- .../backend/ocr_pipeline_session_store.py | 25 +- 4 files changed, 411 insertions(+), 77 deletions(-) diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx b/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx index bbfba67..c94ad4f 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx +++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/page.tsx @@ -11,7 +11,7 @@ import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecogniti import { StepLlmReview } from '@/components/ocr-pipeline/StepLlmReview' import { StepReconstruction } from '@/components/ocr-pipeline/StepReconstruction' import { StepGroundTruth } from '@/components/ocr-pipeline/StepGroundTruth' -import { PIPELINE_STEPS, type PipelineStep, type SessionListItem, type DocumentTypeResult } from './types' +import { PIPELINE_STEPS, DOCUMENT_CATEGORIES, type PipelineStep, type SessionListItem, type DocumentTypeResult, type DocumentCategory } from './types' const KLAUSUR_API = '/klausur-api' @@ -23,7 +23,9 @@ export default function OcrPipelinePage() { const [loadingSessions, setLoadingSessions] = useState(true) const [editingName, setEditingName] = useState(null) const [editNameValue, setEditNameValue] = useState('') + const [editingCategory, setEditingCategory] = useState(null) const [docTypeResult, setDocTypeResult] = useState(null) + const [activeCategory, setActiveCategory] = useState(undefined) const [steps, setSteps] = useState( PIPELINE_STEPS.map((s, i) => ({ ...s, @@ -59,6 +61,7 @@ export default function OcrPipelinePage() { setSessionId(sid) setSessionName(data.name || data.filename || '') + setActiveCategory(data.document_category || undefined) // Restore doc type result if available const savedDocType: DocumentTypeResult | null = data.doc_type_result || null @@ -115,6 +118,36 @@ export default function OcrPipelinePage() { setEditingName(null) }, [sessionId]) + const updateCategory = useCallback(async (sid: string, category: DocumentCategory) => { + try { + await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sid}`, { + method: 'PUT', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ document_category: category }), + }) + setSessions((prev) => prev.map((s) => (s.id === sid ? { ...s, document_category: category } : s))) + if (sessionId === sid) setActiveCategory(category) + } catch (e) { + console.error('Failed to update category:', e) + } + setEditingCategory(null) + }, [sessionId]) + + const deleteAllSessions = useCallback(async () => { + if (!confirm('Alle Sessions loeschen? Dies kann nicht rueckgaengig gemacht werden.')) return + try { + await fetch(`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions`, { method: 'DELETE' }) + setSessions([]) + setSessionId(null) + setCurrentStep(0) + setDocTypeResult(null) + setActiveCategory(undefined) + setSteps(PIPELINE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' }))) + } catch (e) { + console.error('Failed to delete all sessions:', e) + } + }, []) + const handleStepClick = (index: number) => { if (index <= currentStep || steps[index].status === 'completed') { setCurrentStep(index) @@ -307,14 +340,25 @@ export default function OcrPipelinePage() {

- Sessions + Sessions ({sessions.length})

- +
+ {sessions.length > 0 && ( + + )} + +
{loadingSessions ? ( @@ -322,75 +366,165 @@ export default function OcrPipelinePage() { ) : sessions.length === 0 ? (
Noch keine Sessions vorhanden.
) : ( -
- {sessions.map((s) => ( -
-
openSession(s.id)}> - {editingName === s.id ? ( - setEditNameValue(e.target.value)} - onBlur={() => renameSession(s.id, editNameValue)} - onKeyDown={(e) => { - if (e.key === 'Enter') renameSession(s.id, editNameValue) - if (e.key === 'Escape') setEditingName(null) - }} - onClick={(e) => e.stopPropagation()} - className="w-full px-1 py-0.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600" +
+ {sessions.map((s) => { + const catInfo = DOCUMENT_CATEGORIES.find(c => c.value === s.document_category) + return ( +
+ {/* Thumbnail */} +
openSession(s.id)} + > + {/* eslint-disable-next-line @next/next/no-img-element */} + { (e.target as HTMLImageElement).style.display = 'none' }} /> - ) : ( -
- {s.name || s.filename} +
+ + {/* Info */} +
openSession(s.id)}> + {editingName === s.id ? ( + setEditNameValue(e.target.value)} + onBlur={() => renameSession(s.id, editNameValue)} + onKeyDown={(e) => { + if (e.key === 'Enter') renameSession(s.id, editNameValue) + if (e.key === 'Escape') setEditingName(null) + }} + onClick={(e) => e.stopPropagation()} + className="w-full px-1 py-0.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600" + /> + ) : ( +
+ {s.name || s.filename} +
+ )} + {/* ID row */} + +
+ {new Date(s.created_at).toLocaleDateString('de-DE', { day: '2-digit', month: '2-digit', year: '2-digit', hour: '2-digit', minute: '2-digit' })} + Schritt {s.current_step}: {stepNames[s.current_step] || '?'} +
+
+ + {/* Badges */} +
e.stopPropagation()}> + {/* Category Badge */} + + {/* Doc Type Badge (read-only) */} + {s.doc_type && ( + + {s.doc_type} + + )} +
+ + {/* Action buttons */} +
+ + +
+ + {/* Category dropdown (inline) */} + {editingCategory === s.id && ( +
e.stopPropagation()} + > + {DOCUMENT_CATEGORIES.map((cat) => ( + + ))}
)} -
- {new Date(s.created_at).toLocaleDateString('de-DE', { day: '2-digit', month: '2-digit', year: '2-digit', hour: '2-digit', minute: '2-digit' })} - Schritt {s.current_step}: {stepNames[s.current_step] || '?'} -
- - -
- ))} + ) + })}
)}
- {/* Active session name */} + {/* Active session info */} {sessionId && sessionName && ( -
- Aktive Session: {sessionName} +
+ Aktive Session: {sessionName} + {activeCategory && (() => { + const cat = DOCUMENT_CATEGORIES.find(c => c.value === activeCategory) + return cat ? {cat.icon} {cat.label} : null + })()} + {docTypeResult && ( + + {docTypeResult.doc_type} + + )}
)} diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts index 4b1c86f..8734715 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts +++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts @@ -7,16 +7,47 @@ export interface PipelineStep { status: PipelineStepStatus } +export type DocumentCategory = + | 'vokabelseite' | 'buchseite' | 'arbeitsblatt' | 'klausurseite' + | 'mathearbeit' | 'statistik' | 'zeitung' | 'formular' | 'handschrift' | 'sonstiges' + +export const DOCUMENT_CATEGORIES: { value: DocumentCategory; label: string; icon: string }[] = [ + { value: 'vokabelseite', label: 'Vokabelseite', icon: '📖' }, + { value: 'buchseite', label: 'Buchseite', icon: '📚' }, + { value: 'arbeitsblatt', label: 'Arbeitsblatt', icon: '📝' }, + { value: 'klausurseite', label: 'Klausurseite', icon: '📄' }, + { value: 'mathearbeit', label: 'Mathearbeit', icon: '🔢' }, + { value: 'statistik', label: 'Statistik', icon: '📊' }, + { value: 'zeitung', label: 'Zeitung', icon: '📰' }, + { value: 'formular', label: 'Formular', icon: '📋' }, + { value: 'handschrift', label: 'Handschrift', icon: '✍️' }, + { value: 'sonstiges', label: 'Sonstiges', icon: '📎' }, +] + export interface SessionListItem { id: string name: string filename: string status: string current_step: number + document_category?: DocumentCategory + doc_type?: string created_at: string updated_at?: string } +export interface PipelineLogEntry { + step: string + completed_at: string + success: boolean + duration_ms?: number + metrics: Record +} + +export interface PipelineLog { + steps: PipelineLogEntry[] +} + export interface DocumentTypeResult { doc_type: 'vocab_table' | 'full_text' | 'generic_table' confidence: number @@ -34,6 +65,8 @@ export interface SessionInfo { image_height: number original_image_url: string current_step?: number + document_category?: DocumentCategory + doc_type?: string deskew_result?: DeskewResult dewarp_result?: DewarpResult column_result?: ColumnResult diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 3e14872..ab8bc20 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -66,6 +66,7 @@ from cv_vocab_pipeline import ( ) from ocr_pipeline_session_store import ( create_session_db, + delete_all_sessions_db, delete_session_db, get_session_db, get_session_image, @@ -151,8 +152,15 @@ class DewarpGroundTruthRequest(BaseModel): notes: Optional[str] = None -class RenameSessionRequest(BaseModel): - name: str +VALID_DOCUMENT_CATEGORIES = { + 'vokabelseite', 'buchseite', 'arbeitsblatt', 'klausurseite', + 'mathearbeit', 'statistik', 'zeitung', 'formular', 'handschrift', 'sonstiges', +} + + +class UpdateSessionRequest(BaseModel): + name: Optional[str] = None + document_category: Optional[str] = None class ManualColumnsRequest(BaseModel): @@ -281,6 +289,8 @@ async def get_session_info(session_id: str): "image_height": img_h, "original_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/original", "current_step": session.get("current_step", 1), + "document_category": session.get("document_category"), + "doc_type": session.get("doc_type"), } if session.get("deskew_result"): @@ -293,17 +303,31 @@ async def get_session_info(session_id: str): result["row_result"] = session["row_result"] if session.get("word_result"): result["word_result"] = session["word_result"] + if session.get("doc_type_result"): + result["doc_type_result"] = session["doc_type_result"] return result @router.put("/sessions/{session_id}") -async def rename_session(session_id: str, req: RenameSessionRequest): - """Rename a session.""" - updated = await update_session_db(session_id, name=req.name) +async def update_session(session_id: str, req: UpdateSessionRequest): + """Update session name and/or document category.""" + kwargs: Dict[str, Any] = {} + if req.name is not None: + kwargs["name"] = req.name + if req.document_category is not None: + if req.document_category not in VALID_DOCUMENT_CATEGORIES: + raise HTTPException( + status_code=400, + detail=f"Invalid category '{req.document_category}'. Valid: {sorted(VALID_DOCUMENT_CATEGORIES)}", + ) + kwargs["document_category"] = req.document_category + if not kwargs: + raise HTTPException(status_code=400, detail="Nothing to update") + updated = await update_session_db(session_id, **kwargs) if not updated: raise HTTPException(status_code=404, detail=f"Session {session_id} not found") - return {"session_id": session_id, "name": req.name} + return {"session_id": session_id, **kwargs} @router.delete("/sessions/{session_id}") @@ -316,6 +340,78 @@ async def delete_session(session_id: str): return {"session_id": session_id, "deleted": True} +@router.delete("/sessions") +async def delete_all_sessions(): + """Delete ALL sessions (cleanup).""" + _cache.clear() + count = await delete_all_sessions_db() + return {"deleted_count": count} + + +@router.get("/sessions/{session_id}/thumbnail") +async def get_session_thumbnail(session_id: str, size: int = Query(default=80, ge=16, le=400)): + """Return a small thumbnail of the original image.""" + original_png = await get_session_image(session_id, "original") + if not original_png: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found or no image") + arr = np.frombuffer(original_png, dtype=np.uint8) + img = cv2.imdecode(arr, cv2.IMREAD_COLOR) + if img is None: + raise HTTPException(status_code=500, detail="Failed to decode image") + h, w = img.shape[:2] + scale = size / max(h, w) + new_w, new_h = int(w * scale), int(h * scale) + thumb = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA) + _, png_bytes = cv2.imencode(".png", thumb) + return Response(content=png_bytes.tobytes(), media_type="image/png", + headers={"Cache-Control": "public, max-age=3600"}) + + +@router.get("/sessions/{session_id}/pipeline-log") +async def get_pipeline_log(session_id: str): + """Get the pipeline execution log for a session.""" + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + return {"session_id": session_id, "pipeline_log": session.get("pipeline_log") or {"steps": []}} + + +@router.get("/categories") +async def list_categories(): + """List valid document categories.""" + return {"categories": sorted(VALID_DOCUMENT_CATEGORIES)} + + +# --------------------------------------------------------------------------- +# Pipeline Log Helper +# --------------------------------------------------------------------------- + +async def _append_pipeline_log( + session_id: str, + step_name: str, + metrics: Dict[str, Any], + success: bool = True, + duration_ms: Optional[int] = None, +): + """Append a step entry to the session's pipeline_log JSONB.""" + session = await get_session_db(session_id) + if not session: + return + log = session.get("pipeline_log") or {"steps": []} + if not isinstance(log, dict): + log = {"steps": []} + entry = { + "step": step_name, + "completed_at": datetime.utcnow().isoformat(), + "success": success, + "metrics": metrics, + } + if duration_ms is not None: + entry["duration_ms"] = duration_ms + log.setdefault("steps", []).append(entry) + await update_session_db(session_id, pipeline_log=log) + + # --------------------------------------------------------------------------- # Image Endpoints # --------------------------------------------------------------------------- @@ -448,6 +544,12 @@ async def auto_deskew(session_id: str): logger.info(f"OCR Pipeline: deskew session {session_id}: " f"hough={angle_hough:.2f} wa={angle_wa:.2f} -> {method_used} {angle_applied:.2f}") + await _append_pipeline_log(session_id, "deskew", { + "angle_applied": round(angle_applied, 3), + "confidence": round(confidence, 2), + "method": method_used, + }, duration_ms=int(duration * 1000)) + return { "session_id": session_id, **deskew_result, @@ -680,6 +782,13 @@ async def auto_dewarp( f"method={dewarp_info['method']} shear={dewarp_info['shear_degrees']:.3f} " f"conf={dewarp_info['confidence']:.2f} ({duration:.2f}s)") + await _append_pipeline_log(session_id, "dewarp", { + "shear_degrees": dewarp_info["shear_degrees"], + "confidence": dewarp_info["confidence"], + "method": dewarp_info["method"], + "ensemble_methods": [d.get("method", "") for d in dewarp_info.get("detections", [])], + }, duration_ms=int(duration * 1000)) + return { "session_id": session_id, **dewarp_result, @@ -808,6 +917,13 @@ async def detect_type(session_id: str): logger.info(f"OCR Pipeline: detect-type session {session_id}: " f"{result.doc_type} (confidence={result.confidence}, {duration:.2f}s)") + await _append_pipeline_log(session_id, "detect_type", { + "doc_type": result.doc_type, + "pipeline": result.pipeline, + "confidence": result.confidence, + **{k: v for k, v in (result.features or {}).items() if isinstance(v, (int, float, str, bool))}, + }, duration_ms=int(duration * 1000)) + return {"session_id": session_id, **result_dict} @@ -896,6 +1012,13 @@ async def detect_columns(session_id: str): logger.info(f"OCR Pipeline: columns session {session_id}: " f"{col_count} columns detected ({duration:.2f}s)") + img_w = dewarped_bgr.shape[1] + await _append_pipeline_log(session_id, "columns", { + "total_columns": len(columns), + "column_widths_pct": [round(c["width"] / img_w * 100, 1) for c in columns], + "column_types": [c["type"] for c in columns], + }, duration_ms=int(duration * 1000)) + return { "session_id": session_id, **column_result, @@ -1112,6 +1235,15 @@ async def detect_rows(session_id: str): logger.info(f"OCR Pipeline: rows session {session_id}: " f"{len(rows)} rows detected ({duration:.2f}s): {type_counts}") + content_rows = sum(1 for r in rows if r.row_type == "content") + avg_height = round(sum(r.height for r in rows) / len(rows)) if rows else 0 + await _append_pipeline_log(session_id, "rows", { + "total_rows": len(rows), + "content_rows": content_rows, + "artifact_rows_removed": type_counts.get("header", 0) + type_counts.get("footer", 0), + "avg_row_height_px": avg_height, + }, duration_ms=int(duration * 1000)) + return { "session_id": session_id, **row_result, @@ -1369,6 +1501,15 @@ async def detect_words( f"layout={word_result['layout']}, " f"{len(cells)} cells ({duration:.2f}s), summary: {word_result['summary']}") + await _append_pipeline_log(session_id, "words", { + "total_cells": len(cells), + "non_empty_cells": word_result["summary"]["non_empty_cells"], + "low_confidence_count": word_result["summary"]["low_confidence"], + "ocr_engine": used_engine, + "layout": word_result["layout"], + "entry_count": word_result.get("entry_count", 0), + }, duration_ms=int(duration * 1000)) + return { "session_id": session_id, **word_result, @@ -1774,6 +1915,13 @@ async def run_llm_review(session_id: str, request: Request, stream: bool = False logger.info(f"LLM review session {session_id}: {len(result['changes'])} changes, " f"{result['duration_ms']}ms, model={result['model_used']}") + await _append_pipeline_log(session_id, "correction", { + "engine": "llm", + "model": result["model_used"], + "total_entries": len(entries), + "corrections_proposed": len(result["changes"]), + }, duration_ms=result["duration_ms"]) + return { "session_id": session_id, "changes": result["changes"], diff --git a/klausur-service/backend/ocr_pipeline_session_store.py b/klausur-service/backend/ocr_pipeline_session_store.py index 254d662..12f3c2a 100644 --- a/klausur-service/backend/ocr_pipeline_session_store.py +++ b/klausur-service/backend/ocr_pipeline_session_store.py @@ -66,7 +66,9 @@ async def init_ocr_pipeline_tables(): ADD COLUMN IF NOT EXISTS clean_png BYTEA, ADD COLUMN IF NOT EXISTS handwriting_removal_meta JSONB, ADD COLUMN IF NOT EXISTS doc_type VARCHAR(50), - ADD COLUMN IF NOT EXISTS doc_type_result JSONB + ADD COLUMN IF NOT EXISTS doc_type_result JSONB, + ADD COLUMN IF NOT EXISTS document_category VARCHAR(50), + ADD COLUMN IF NOT EXISTS pipeline_log JSONB """) @@ -91,6 +93,7 @@ async def create_session_db( deskew_result, dewarp_result, column_result, row_result, word_result, ground_truth, auto_shear_degrees, doc_type, doc_type_result, + document_category, pipeline_log, created_at, updated_at """, uuid.UUID(session_id), name, filename, original_png) @@ -106,6 +109,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]: deskew_result, dewarp_result, column_result, row_result, word_result, ground_truth, auto_shear_degrees, doc_type, doc_type_result, + document_category, pipeline_log, created_at, updated_at FROM ocr_pipeline_sessions WHERE id = $1 """, uuid.UUID(session_id)) @@ -151,9 +155,10 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'auto_shear_degrees', 'doc_type', 'doc_type_result', + 'document_category', 'pipeline_log', } - jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result'} + jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log'} for key, value in kwargs.items(): if key in allowed_fields: @@ -180,6 +185,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any deskew_result, dewarp_result, column_result, row_result, word_result, ground_truth, auto_shear_degrees, doc_type, doc_type_result, + document_category, pipeline_log, created_at, updated_at """, *values) @@ -194,6 +200,7 @@ async def list_sessions_db(limit: int = 50) -> List[Dict[str, Any]]: async with pool.acquire() as conn: rows = await conn.fetch(""" SELECT id, name, filename, status, current_step, + document_category, doc_type, created_at, updated_at FROM ocr_pipeline_sessions ORDER BY created_at DESC @@ -213,6 +220,18 @@ async def delete_session_db(session_id: str) -> bool: return result == "DELETE 1" +async def delete_all_sessions_db() -> int: + """Delete all sessions. Returns number of deleted rows.""" + pool = await get_pool() + async with pool.acquire() as conn: + result = await conn.execute("DELETE FROM ocr_pipeline_sessions") + # result is e.g. "DELETE 5" + try: + return int(result.split()[-1]) + except (ValueError, IndexError): + return 0 + + # ============================================================================= # HELPER # ============================================================================= @@ -235,7 +254,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]: result[key] = result[key].isoformat() # JSONB → parsed (asyncpg returns str for JSONB) - for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result']: + for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result', 'pipeline_log']: if key in result and result[key] is not None: if isinstance(result[key], str): result[key] = json.loads(result[key])