Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
472
klausur-service/backend/vocab_worksheet_analysis_api.py
Normal file
472
klausur-service/backend/vocab_worksheet_analysis_api.py
Normal file
@@ -0,0 +1,472 @@
|
||||
"""
|
||||
Vocabulary Worksheet Analysis API - OCR export, ground truth labeling,
|
||||
extract-with-boxes, deskewed images, and learning unit generation.
|
||||
|
||||
The two large handlers (compare_ocr_methods, analyze_grid) live in
|
||||
vocab_worksheet_compare_api.py and are included via compare_router.
|
||||
"""
|
||||
|
||||
import io
import json
import logging
import os
from datetime import datetime, timezone
from typing import Any, Dict, Optional

from fastapi import APIRouter, Body, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
|
||||
def _get_sessions():
    """Return the shared in-memory session store.

    Imported lazily at call time to avoid a circular import with
    vocab_worksheet_api.
    """
    import vocab_worksheet_api
    return vocab_worksheet_api._sessions
|
||||
|
||||
def _get_local_storage_path():
    """Return the configured local storage root.

    Resolved lazily to avoid a circular import with vocab_worksheet_api.
    """
    import vocab_worksheet_api
    return vocab_worksheet_api.LOCAL_STORAGE_PATH
|
||||
from vocab_worksheet_generation import convert_pdf_page_to_image
|
||||
|
||||
# Try to import Tesseract extractor
|
||||
try:
|
||||
from tesseract_vocab_extractor import (
|
||||
extract_bounding_boxes, TESSERACT_AVAILABLE,
|
||||
)
|
||||
except ImportError:
|
||||
TESSERACT_AVAILABLE = False
|
||||
|
||||
# Try to import Grid Detection Service
|
||||
try:
|
||||
from services.grid_detection_service import GridDetectionService
|
||||
GRID_SERVICE_AVAILABLE = True
|
||||
except ImportError:
|
||||
GRID_SERVICE_AVAILABLE = False
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
analysis_router = APIRouter()
|
||||
|
||||
def _ocr_export_dir():
    """Directory where cross-app OCR export JSON files live."""
    base = _get_local_storage_path()
    return os.path.join(base, "ocr-exports")
|
||||
|
||||
def _ground_truth_dir():
    """Directory where ground-truth label JSON files live."""
    base = _get_local_storage_path()
    return os.path.join(base, "ground-truth")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# OCR Export Endpoints (for cross-app OCR data sharing)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@analysis_router.post("/sessions/{session_id}/ocr-export/{page_number}")
async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)):
    """
    Save OCR export data for cross-app sharing (admin-v2 -> studio-v2).

    Both apps proxy to klausur-service via /klausur-api/, so this endpoint
    serves as shared storage accessible from both ports.

    Writes the payload to <storage>/ocr-exports/<session>_page<N>.json and
    updates a "latest.json" pointer so clients can fetch the most recent
    export without knowing the session id.
    """
    logger.info(f"Saving OCR export for session {session_id}, page {page_number}")

    os.makedirs(_ocr_export_dir(), exist_ok=True)

    # Save the export data
    export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")
    with open(export_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # Update latest pointer. Use a timezone-aware UTC timestamp:
    # datetime.utcnow() is deprecated (Python 3.12+) and produces a naive
    # datetime that is ambiguous once serialized.
    latest_path = os.path.join(_ocr_export_dir(), "latest.json")
    with open(latest_path, 'w', encoding='utf-8') as f:
        json.dump({
            "session_id": session_id,
            "page_number": page_number,
            "saved_at": datetime.now(timezone.utc).isoformat(),
        }, f, ensure_ascii=False, indent=2)

    return {
        "success": True,
        "session_id": session_id,
        "page_number": page_number,
        "message": "OCR export saved successfully",
    }
|
||||
|
||||
|
||||
@analysis_router.get("/sessions/{session_id}/ocr-export/{page_number}")
async def load_ocr_export(session_id: str, page_number: int):
    """Load a specific OCR export by session and page number."""
    path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")
    if not os.path.exists(path):
        raise HTTPException(status_code=404, detail="OCR export not found")

    # Return the stored payload verbatim.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)
|
||||
|
||||
|
||||
@analysis_router.get("/ocr-export/latest")
async def load_latest_ocr_export():
    """Load the most recently saved OCR export data."""
    latest_path = os.path.join(_ocr_export_dir(), "latest.json")
    if not os.path.exists(latest_path):
        raise HTTPException(status_code=404, detail="No OCR exports found")

    # The pointer file records which session/page was written last.
    with open(latest_path, 'r', encoding='utf-8') as f:
        pointer = json.load(f)

    export_name = f"{pointer.get('session_id')}_page{pointer.get('page_number')}.json"
    export_path = os.path.join(_ocr_export_dir(), export_name)
    if not os.path.exists(export_path):
        raise HTTPException(status_code=404, detail="Latest OCR export file not found")

    with open(export_path, 'r', encoding='utf-8') as f:
        return json.load(f)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Extract with Boxes & Deskewed Image
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _row_to_entry(row_idx: int, row_cells) -> Optional[dict]:
    """Collapse one detected grid row into a vocabulary entry dict.

    Returns None when the row contains no text in any column (callers skip
    such rows). All bbox coordinates are in percent (0-100), rounded to two
    decimals. Confidence is averaged over every non-empty cell in the row,
    regardless of column type, matching the grid detector's cell confidences.
    """
    from services.grid_detection_service import ColumnType

    # Per-column text and bounding box, keyed by output suffix.
    texts = {"en": "", "de": "", "ex": ""}
    bboxes = {"en": None, "de": None, "ex": None}
    column_keys = {
        ColumnType.ENGLISH: "en",
        ColumnType.GERMAN: "de",
        ColumnType.EXAMPLE: "ex",
    }

    conf_sum = 0.0
    conf_count = 0
    for cell in row_cells:
        cell_bbox = {"x": round(cell.x, 2), "y": round(cell.y, 2),
                     "w": round(cell.width, 2), "h": round(cell.height, 2)}
        key = column_keys.get(cell.column_type)
        if key is not None:
            texts[key] = cell.text.strip()
            bboxes[key] = cell_bbox
        # Every non-empty cell contributes to the row confidence, even if
        # its column type is not one of the three exported ones.
        if cell.text.strip():
            conf_sum += cell.confidence
            conf_count += 1

    # Completely empty rows are dropped by the caller.
    if not texts["en"] and not texts["de"] and not texts["ex"]:
        return None

    # Whole-row bounding box: union of the present column boxes.
    present = [b for b in (bboxes["en"], bboxes["de"], bboxes["ex"]) if b is not None]
    if present:
        x0 = min(b["x"] for b in present)
        y0 = min(b["y"] for b in present)
        x1 = max(b["x"] + b["w"] for b in present)
        y1 = max(b["y"] + b["h"] for b in present)
        row_bbox = {"x": round(x0, 2), "y": round(y0, 2),
                    "w": round(x1 - x0, 2), "h": round(y1 - y0, 2)}
    else:
        row_bbox = {"x": 0, "y": 0, "w": 100, "h": 3}

    avg_conf = round((conf_sum / conf_count * 100) if conf_count > 0 else 0, 1)

    empty_box = {"x": 0, "y": 0, "w": 0, "h": 0}
    return {
        "row_index": row_idx,
        "english": texts["en"],
        "german": texts["de"],
        "example": texts["ex"],
        "confidence": avg_conf,
        "bbox": row_bbox,
        "bbox_en": bboxes["en"] or empty_box,
        "bbox_de": bboxes["de"] or empty_box,
        "bbox_ex": bboxes["ex"] or empty_box,
    }


async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Extract vocabulary entries with bounding boxes using Tesseract + GridDetectionService.

    Returns dict with 'entries' list and 'image_width'/'image_height'.
    Each entry has row_index, english, german, example, confidence, bbox, bbox_en, bbox_de, bbox_ex.
    All bbox coordinates are in percent (0-100).

    Raises HTTPException(500) when Tesseract or the grid service is unavailable.
    """
    if not TESSERACT_AVAILABLE:
        raise HTTPException(status_code=500, detail="Tesseract not available")
    if not GRID_SERVICE_AVAILABLE:
        raise HTTPException(status_code=500, detail="GridDetectionService not available")

    # Step 1: Tesseract word-level bounding boxes
    tess_result = await extract_bounding_boxes(image_bytes, lang=lang)
    words = tess_result.get("words", [])
    img_w = tess_result.get("image_width", 0)
    img_h = tess_result.get("image_height", 0)

    empty_result = {"entries": [], "image_width": img_w, "image_height": img_h}
    if not words or img_w == 0 or img_h == 0:
        return empty_result

    # Step 2: Convert to OCR regions (percentage-based)
    service = GridDetectionService()
    regions = service.convert_tesseract_regions(words, img_w, img_h)
    if not regions:
        return empty_result

    # Step 3: Detect grid
    grid_result = service.detect_grid(regions)
    if not grid_result.cells:
        return empty_result

    # Step 4: One entry per non-empty logical row. Note row_index follows the
    # detector's row numbering, so skipped (empty) rows leave gaps.
    entries = []
    for row_idx, row_cells in enumerate(grid_result.cells):
        entry = _row_to_entry(row_idx, row_cells)
        if entry is not None:
            entries.append(entry)

    return {"entries": entries, "image_width": img_w, "image_height": img_h}
|
||||
|
||||
|
||||
@analysis_router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int):
    """Extract vocabulary entries with bounding boxes for ground truth labeling.

    Uses Tesseract + GridDetectionService for spatial positioning.
    page_number is 0-indexed.
    """
    logger.info(f"Extract with boxes for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if not (0 <= page_number < page_count):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Render the requested page at high resolution.
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Straighten the scan before OCR; failure is non-fatal and falls back to
    # the unrotated image.
    deskew_angle = 0.0
    try:
        from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
        if CV2_AVAILABLE:
            image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
            logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
    except Exception as e:
        logger.warning(f"Deskew failed for page {page_number}: {e}")

    # Keep the deskewed bitmap so the /deskewed-image endpoint serves the
    # exact pixels the boxes were computed on.
    session.setdefault("deskewed_images", {})[str(page_number)] = image_data

    # OCR + grid detection on the (possibly deskewed) image.
    result = await extract_entries_with_boxes(image_data)

    # Cache the extracted rows for the ground-truth workflow.
    session.setdefault("gt_entries", {})[str(page_number)] = result["entries"]

    return {
        "success": True,
        "entries": result["entries"],
        "entry_count": len(result["entries"]),
        "image_width": result["image_width"],
        "image_height": result["image_height"],
        "deskew_angle": round(deskew_angle, 2),
        "deskewed": abs(deskew_angle) > 0.05,
    }
|
||||
|
||||
|
||||
@analysis_router.get("/sessions/{session_id}/deskewed-image/{page_number}")
async def get_deskewed_image(session_id: str, page_number: int):
    """Return the deskewed page image as PNG.

    Falls back to the original hires image if no deskewed version is cached.
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    # Serve the cached deskewed bitmap when extract-with-boxes produced one.
    cached = session.get("deskewed_images", {}).get(str(page_number))
    if cached:
        return StreamingResponse(io.BytesIO(cached), media_type="image/png")

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    # No cached deskew — re-render the original page on the fly.
    rendered = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
    return StreamingResponse(io.BytesIO(rendered), media_type="image/png")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Ground Truth Labeling
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@analysis_router.post("/sessions/{session_id}/ground-truth/{page_number}")
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
    """Save ground truth labels for a page.

    Expects body with 'entries' list - each entry has english, german, example,
    status ('confirmed' | 'edited' | 'skipped'), and bbox fields.

    Persists to both the in-memory session and
    <storage>/ground-truth/<session>_page<N>.json, and returns per-status counts.
    """
    logger.info(f"Save ground truth for session {session_id}, page {page_number}")

    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    entries = data.get("entries", [])
    if not entries:
        raise HTTPException(status_code=400, detail="No entries provided")

    # Save in session
    session = _get_sessions()[session_id]
    if "ground_truth" not in session:
        session["ground_truth"] = {}
    session["ground_truth"][str(page_number)] = entries

    # Also save to disk. Use a timezone-aware UTC timestamp: naive
    # datetime.now() is ambiguous across hosts in different timezones.
    os.makedirs(_ground_truth_dir(), exist_ok=True)
    gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
    gt_data = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.now(timezone.utc).isoformat(),
        "entry_count": len(entries),
        "entries": entries,
    }
    with open(gt_path, 'w', encoding='utf-8') as f:
        json.dump(gt_data, f, ensure_ascii=False, indent=2)

    logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}")

    # Tally statuses in a single pass instead of three full list scans.
    status_counts = {"confirmed": 0, "edited": 0, "skipped": 0}
    for entry in entries:
        status = entry.get("status")
        if status in status_counts:
            status_counts[status] += 1

    return {
        "success": True,
        "saved_count": len(entries),
        "confirmed": status_counts["confirmed"],
        "edited": status_counts["edited"],
        "skipped": status_counts["skipped"],
        "file_path": gt_path,
    }
|
||||
|
||||
|
||||
@analysis_router.get("/sessions/{session_id}/ground-truth/{page_number}")
async def load_ground_truth(session_id: str, page_number: int):
    """Load saved ground truth for a page."""
    logger.info(f"Load ground truth for session {session_id}, page {page_number}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # Prefer the in-memory copy while the session still holds it.
    cached = sessions[session_id].get("ground_truth", {}).get(str(page_number))
    if cached:
        return {"success": True, "entries": cached, "source": "cache"}

    # Otherwise fall back to the JSON file written by save_ground_truth.
    gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
    if not os.path.exists(gt_path):
        raise HTTPException(status_code=404, detail="No ground truth found for this page")

    with open(gt_path, 'r', encoding='utf-8') as f:
        gt_data = json.load(f)
    return {"success": True, "entries": gt_data.get("entries", []), "source": "disk"}
|
||||
|
||||
|
||||
# ─── Learning Module Generation ─────────────────────────────────────────────
|
||||
|
||||
|
||||
class GenerateLearningUnitRequest(BaseModel):
    """Request body for the generate-learning-unit endpoint."""
    # Optional grade level forwarded when creating the learning unit.
    grade: Optional[str] = None
    # When True, MC/Cloze/QA module generation is triggered after unit creation.
    generate_modules: bool = True
|
||||
|
||||
|
||||
@analysis_router.post("/sessions/{session_id}/generate-learning-unit")
async def generate_learning_unit_endpoint(session_id: str, request: Optional[GenerateLearningUnitRequest] = None):
    """
    Create a Learning Unit from the vocabulary in this session.

    1. Takes vocabulary from the session
    2. Creates a Learning Unit in backend-lehrer
    3. Optionally triggers MC/Cloze/QA generation

    Returns the created unit info and generation status.

    Raises HTTPException: 404 (unknown session), 400 (no vocabulary / invalid
    data), 501 (bridge module missing), 502 (backend-lehrer failure).
    """
    # FastAPI passes None when the body is omitted; fall back to defaults.
    if request is None:
        request = GenerateLearningUnitRequest()

    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]
    vocabulary = session.get("vocabulary", [])

    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary in this session")

    try:
        from vocab_learn_bridge import create_learning_unit, generate_learning_modules

        # Step 1: Create Learning Unit
        result = await create_learning_unit(
            session_name=session["name"],
            vocabulary=vocabulary,
            grade=request.grade,
        )

        # Step 2: Generate modules if requested. A failure here must not lose
        # the already-created unit, so it is reported instead of raised.
        if request.generate_modules:
            try:
                gen_result = await generate_learning_modules(
                    unit_id=result["unit_id"],
                    analysis_path=result["analysis_path"],
                )
                result["generation"] = gen_result
            except Exception as e:
                logger.warning(f"Module generation failed (unit created): {e}")
                result["generation"] = {"status": "error", "reason": str(e)}

        return result

    # Chain causes (`from e`) so the original traceback survives into logs.
    except ImportError as e:
        raise HTTPException(status_code=501, detail="vocab_learn_bridge module not available") from e
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        raise HTTPException(status_code=502, detail=str(e)) from e
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Include compare_ocr_methods & analyze_grid from companion module
|
||||
# =============================================================================
|
||||
|
||||
from vocab_worksheet_compare_api import compare_router # noqa: E402
|
||||
|
||||
analysis_router.include_router(compare_router)
|
||||
Reference in New Issue
Block a user