feat(ocr): Add Ground Truth labeling UI for OCR comparison

Adds a step-through tool for creating 100% correct reference data (ground truth)
with position information. Users scan a page, review each vocabulary entry with
image crops, confirm or correct the OCR text, and save the result as JSON.

Backend: extract_entries_with_boxes() helper + 3 endpoints (extract-with-boxes,
ground-truth save/load). Frontend: GroundTruthPanel component with SVG overlay,
ImageCrop, keyboard shortcuts (Enter/Tab/arrows), and tab navigation in page.tsx.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
BreakPilot Dev
2026-02-10 09:04:36 +01:00
parent d4a23e8d99
commit 8c77df494b
4 changed files with 872 additions and 3 deletions

View File

@@ -2001,3 +2001,227 @@ async def load_latest_ocr_export():
data = json.load(f)
return data
# =============================================================================
# Ground Truth Labeling
# =============================================================================
# Directory where per-page ground-truth JSON files are persisted (created lazily on first save).
GROUND_TRUTH_DIR = os.path.join(LOCAL_STORAGE_PATH, "ground-truth")
async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Build vocabulary entries with spatial positions from a page image.

    Pipeline: Tesseract word boxes -> percentage-based OCR regions ->
    grid detection -> per-row grouping by detected column type.

    Args:
        image_bytes: Raw image data of the scanned page.
        lang: Tesseract language spec (default English + German).

    Returns:
        Dict with 'entries' (each carrying row_index, english, german, example,
        confidence, bbox, bbox_en, bbox_de, bbox_ex; all bbox coordinates in
        percent 0-100) plus 'image_width' / 'image_height' in pixels.

    Raises:
        HTTPException: 500 when Tesseract or GridDetectionService is unavailable.
    """
    if not TESSERACT_AVAILABLE:
        raise HTTPException(status_code=500, detail="Tesseract not available")
    if not GRID_SERVICE_AVAILABLE:
        raise HTTPException(status_code=500, detail="GridDetectionService not available")

    # Word-level bounding boxes from Tesseract.
    tess = await extract_bounding_boxes(image_bytes, lang=lang)
    word_boxes = tess.get("words", [])
    width = tess.get("image_width", 0)
    height = tess.get("image_height", 0)

    def _empty_result() -> dict:
        # Shared shape for all early exits.
        return {"entries": [], "image_width": width, "image_height": height}

    if not word_boxes or width == 0 or height == 0:
        return _empty_result()

    # Convert to percentage-based regions, then detect the table grid.
    grid_service = GridDetectionService()
    ocr_regions = grid_service.convert_tesseract_regions(word_boxes, width, height)
    if not ocr_regions:
        return _empty_result()
    grid = grid_service.detect_grid(ocr_regions)
    if not grid.cells:
        return _empty_result()

    from services.grid_detection_service import ColumnType

    # Map each recognized column type to its slot key for per-row grouping.
    column_slots = {
        ColumnType.ENGLISH: "en",
        ColumnType.GERMAN: "de",
        ColumnType.EXAMPLE: "ex",
    }

    results = []
    for row_number, cells in enumerate(grid.cells):
        texts = {"en": "", "de": "", "ex": ""}
        boxes = {"en": None, "de": None, "ex": None}
        conf_total = 0.0
        conf_n = 0
        for cell in cells:
            cell_box = {"x": round(cell.x, 2), "y": round(cell.y, 2),
                        "w": round(cell.width, 2), "h": round(cell.height, 2)}
            slot = column_slots.get(cell.column_type)
            if slot is not None:
                texts[slot] = cell.text.strip()
                boxes[slot] = cell_box
            # Confidence averages over every non-blank cell, known column or not.
            if cell.text.strip():
                conf_total += cell.confidence
                conf_n += 1

        # Rows with no usable text in any known column are dropped entirely.
        if not any(texts.values()):
            continue

        # Whole-row bbox is the union of the positioned column boxes.
        positioned = [b for b in boxes.values() if b is not None]
        if positioned:
            left = min(b["x"] for b in positioned)
            top = min(b["y"] for b in positioned)
            right = max(b["x"] + b["w"] for b in positioned)
            bottom = max(b["y"] + b["h"] for b in positioned)
            whole_row = {"x": round(left, 2), "y": round(top, 2),
                         "w": round(right - left, 2), "h": round(bottom - top, 2)}
        else:
            # Fallback: thin full-width strip when no column had a position.
            whole_row = {"x": 0, "y": 0, "w": 100, "h": 3}

        # NOTE(review): assumes cell.confidence is on a 0-1 scale (scaled to
        # percent here) — confirm against GridDetectionService.
        avg_confidence = round((conf_total / conf_n * 100) if conf_n > 0 else 0, 1)
        results.append({
            "row_index": row_number,
            "english": texts["en"],
            "german": texts["de"],
            "example": texts["ex"],
            "confidence": avg_confidence,
            "bbox": whole_row,
            "bbox_en": boxes["en"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_de": boxes["de"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_ex": boxes["ex"] or {"x": 0, "y": 0, "w": 0, "h": 0},
        })
    return {"entries": results, "image_width": width, "image_height": height}
@router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int):
    """Run box-aware OCR extraction on one page of a session's PDF.

    Uses Tesseract + GridDetectionService for spatial positioning.
    page_number is 0-indexed. Extracted entries are cached on the session
    under 'gt_entries', keyed by the page number as a string.
    """
    logger.info(f"Extract with boxes for session {session_id}, page {page_number}")
    session = _sessions.get(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")
    pdf_bytes = session.get("pdf_data")
    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    total_pages = session.get("pdf_page_count", 1)
    if not (0 <= page_number < total_pages):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {total_pages} pages (0-indexed).")

    # Full-resolution render (thumbnail=False) so crops stay legible.
    page_image = await convert_pdf_page_to_image(pdf_bytes, page_number, thumbnail=False)
    extraction = await extract_entries_with_boxes(page_image)

    # Cache the entries on the session for the labeling workflow.
    session.setdefault("gt_entries", {})[str(page_number)] = extraction["entries"]

    return {
        "success": True,
        "entries": extraction["entries"],
        "entry_count": len(extraction["entries"]),
        "image_width": extraction["image_width"],
        "image_height": extraction["image_height"],
    }
@router.post("/sessions/{session_id}/ground-truth/{page_number}")
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
    """Persist ground truth labels for a page (session cache + JSON on disk).

    Expects body with 'entries' list - each entry has english, german, example,
    status ('confirmed' | 'edited' | 'skipped'), and bbox fields.
    """
    logger.info(f"Save ground truth for session {session_id}, page {page_number}")
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    labeled = data.get("entries", [])
    if not labeled:
        raise HTTPException(status_code=400, detail="No entries provided")

    # In-memory copy on the session for fast reloads within this run.
    _sessions[session_id].setdefault("ground_truth", {})[str(page_number)] = labeled

    # On-disk copy so labels survive server restarts.
    os.makedirs(GROUND_TRUTH_DIR, exist_ok=True)
    out_path = os.path.join(GROUND_TRUTH_DIR, f"{session_id}_page{page_number}.json")
    payload = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.now().isoformat(),
        "entry_count": len(labeled),
        "entries": labeled,
    }
    with open(out_path, 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)
    logger.info(f"Ground truth saved: {len(labeled)} entries to {out_path}")

    # Tally review statuses in a single pass.
    tally = {"confirmed": 0, "edited": 0, "skipped": 0}
    for entry in labeled:
        status = entry.get("status")
        if status in tally:
            tally[status] += 1

    return {
        "success": True,
        "saved_count": len(labeled),
        "confirmed": tally["confirmed"],
        "edited": tally["edited"],
        "skipped": tally["skipped"],
        "file_path": out_path,
    }
@router.get("/sessions/{session_id}/ground-truth/{page_number}")
async def load_ground_truth(session_id: str, page_number: int):
    """Return saved ground truth for a page, preferring the session cache."""
    logger.info(f"Load ground truth for session {session_id}, page {page_number}")
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # Cheapest path: the in-memory copy kept on the session.
    in_memory = _sessions[session_id].get("ground_truth", {}).get(str(page_number))
    if in_memory:
        return {"success": True, "entries": in_memory, "source": "cache"}

    # Fall back to the JSON file written at save time.
    disk_path = os.path.join(GROUND_TRUTH_DIR, f"{session_id}_page{page_number}.json")
    if not os.path.exists(disk_path):
        raise HTTPException(status_code=404, detail="No ground truth found for this page")
    with open(disk_path, 'r', encoding='utf-8') as fh:
        stored = json.load(fh)
    return {"success": True, "entries": stored.get("entries", []), "source": "disk"}