Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 15s
_split_broad_columns() erkennt EN/DE-Gemisch in breiten Spalten via Word-Coverage-Analyse und trennt sie am groessten Luecken-Gap. Thumbnails und Page-Images werden serverseitig per fitz rotiert, Frontend laedt Thumbnails nach OCR-Processing neu. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2542 lines
93 KiB
Python
2542 lines
93 KiB
Python
"""
|
|
Vocabulary Worksheet API - Extract vocabulary from textbook pages and generate worksheets.
|
|
|
|
DATENSCHUTZ/PRIVACY:
|
|
- Alle Verarbeitung erfolgt lokal (Mac Mini mit Ollama)
|
|
- Keine Daten werden an externe Server gesendet
|
|
- DSGVO-konform fuer Schulumgebungen
|
|
|
|
Workflow:
|
|
1. POST /sessions - Create a vocabulary extraction session
|
|
2. POST /sessions/{id}/upload - Upload textbook page image
|
|
3. GET /sessions/{id}/vocabulary - Get extracted vocabulary
|
|
4. PUT /sessions/{id}/vocabulary - Edit vocabulary (corrections)
|
|
5. POST /sessions/{id}/generate - Generate worksheet PDF
|
|
6. GET /worksheets/{id}/pdf - Download generated PDF
|
|
"""
|
|
|
|
from fastapi import APIRouter, Body, HTTPException, UploadFile, File, Form, Query
|
|
from fastapi.responses import StreamingResponse
|
|
from pydantic import BaseModel
|
|
from typing import Optional, List, Dict, Any
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
import uuid
|
|
import os
|
|
import io
|
|
import json
|
|
import base64
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Ollama Configuration - Direct call without external modules
|
|
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
|
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")
|
|
|
|
# Try to import MinIO storage
|
|
try:
|
|
from minio_storage import upload_to_minio, get_from_minio
|
|
MINIO_AVAILABLE = True
|
|
except ImportError:
|
|
MINIO_AVAILABLE = False
|
|
logger.warning("MinIO storage not available, using local storage")
|
|
|
|
# Try to import Tesseract extractor
|
|
try:
|
|
from tesseract_vocab_extractor import (
|
|
extract_bounding_boxes, run_tesseract_pipeline,
|
|
match_positions_to_vocab, TESSERACT_AVAILABLE,
|
|
)
|
|
except ImportError:
|
|
TESSERACT_AVAILABLE = False
|
|
logger.warning("Tesseract extractor not available")
|
|
|
|
# Try to import CV Pipeline
|
|
try:
|
|
from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
|
|
except ImportError:
|
|
CV_PIPELINE_AVAILABLE = False
|
|
logger.warning("CV vocab pipeline not available")
|
|
|
|
# Try to import OCR Pipeline functions (for process-single-page)
|
|
try:
|
|
import cv2
|
|
import numpy as np
|
|
from cv_vocab_pipeline import (
|
|
deskew_image, deskew_image_by_word_alignment, deskew_image_iterative,
|
|
deskew_two_pass,
|
|
dewarp_image, create_ocr_image,
|
|
detect_column_geometry, analyze_layout_by_words, analyze_layout, create_layout_image,
|
|
detect_row_geometry, build_cell_grid_v2,
|
|
_cells_to_vocab_entries, _detect_sub_columns, _detect_header_footer_gaps,
|
|
_split_broad_columns,
|
|
expand_narrow_columns, positional_column_regions, llm_review_entries,
|
|
detect_and_fix_orientation,
|
|
_fix_phonetic_brackets,
|
|
render_pdf_high_res,
|
|
PageRegion, RowGeometry,
|
|
)
|
|
from ocr_pipeline_session_store import (
|
|
create_session_db as create_pipeline_session_db,
|
|
update_session_db as update_pipeline_session_db,
|
|
)
|
|
OCR_PIPELINE_AVAILABLE = True
|
|
except ImportError as _ocr_pipe_err:
|
|
OCR_PIPELINE_AVAILABLE = False
|
|
logger.warning(f"OCR Pipeline functions not available: {_ocr_pipe_err}")
|
|
|
|
# Try to import Grid Detection Service
|
|
try:
|
|
from services.grid_detection_service import GridDetectionService
|
|
GRID_SERVICE_AVAILABLE = True
|
|
except ImportError:
|
|
GRID_SERVICE_AVAILABLE = False
|
|
logger.warning("Grid Detection Service not available")
|
|
|
|
# Database integration (used by main.py lifespan)
|
|
try:
|
|
from vocab_session_store import (
|
|
DATABASE_URL, get_pool, init_vocab_tables,
|
|
list_sessions_db, get_session_db,
|
|
)
|
|
except ImportError:
|
|
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
|
|
get_pool = None
|
|
init_vocab_tables = None
|
|
list_sessions_db = None
|
|
get_session_db = None
|
|
|
|
_db_pool = None
|
|
|
|
|
|
def set_db_pool(pool):
    """Store ``pool`` as this module's shared database connection pool.

    The original docstring notes that main.py's lifespan hook is the caller.
    The value is kept in the module-level ``_db_pool`` variable; nothing is
    returned.
    """
    global _db_pool
    _db_pool = pool
|
|
|
|
|
|
async def _init_vocab_table():
    """Create the vocab tables via ``init_vocab_tables`` when it is available.

    ``init_vocab_tables`` is ``None`` when ``vocab_session_store`` could not be
    imported; in that case only the "ready" message is logged. Failures of the
    initialiser are logged as warnings and never propagated.
    """
    if not init_vocab_tables:
        # No session store imported: nothing to initialise.
        logger.info("vocab_session_cache table ready")
        return

    try:
        await init_vocab_tables()
        logger.info("vocab_session_cache table ready")
    except Exception as exc:
        logger.warning(f"Failed to init vocab tables: {exc}")
|
|
|
|
|
|
def _coerce_created_at(value: Any) -> datetime:
    """Normalise a stored ``created_at`` value to a naive UTC ``datetime``.

    Accepts ``datetime`` objects and ISO-8601 strings. Anything else (missing
    value, empty or unparsable string) maps to ``datetime.min`` so the session
    stays listable and simply sorts as the oldest entry.
    """
    from datetime import timezone  # local import: the module only imports ``datetime``

    if isinstance(value, str):
        try:
            value = datetime.fromisoformat(value.strip())
        except ValueError:
            value = None
    if not isinstance(value, datetime):
        return datetime.min
    if value.tzinfo is not None:
        # create_session() stores naive UTC timestamps; aware and naive
        # datetimes cannot be compared, so convert to UTC and drop the offset.
        value = value.astimezone(timezone.utc).replace(tzinfo=None)
    return value


async def _load_all_sessions():
    """Load vocab sessions from the database into the in-memory ``_sessions`` cache.

    At most 500 sessions are requested. Sessions already present in the cache
    are left untouched. ``created_at`` is stored as a ``datetime`` (not a
    string) so that cached rows have the same type as sessions made by
    ``create_session`` and can be sorted together with them.

    Does nothing except log when ``list_sessions_db`` is unavailable. Any
    database error is logged as a warning and swallowed.
    """
    if not list_sessions_db:
        logger.info("Loaded 0 vocab sessions from database")
        return

    try:
        sessions = await list_sessions_db(limit=500)
        count = 0
        for s in sessions:
            # Rows may expose the identifier under either key.
            sid = s.get("id") or s.get("session_id")
            if sid and sid not in _sessions:
                _sessions[sid] = {
                    "id": sid,
                    "name": s.get("name", ""),
                    "description": s.get("description", ""),
                    "status": s.get("status", "created"),
                    "vocabulary_count": s.get("vocabulary_count", 0),
                    "source_language": s.get("source_language", "en"),
                    "target_language": s.get("target_language", "de"),
                    "created_at": _coerce_created_at(s.get("created_at")),
                }
                count += 1
        logger.info(f"Loaded {count} vocab sessions from database")
    except Exception as e:
        logger.warning(f"Failed to load sessions from database: {e}")
|
|
|
|
|
|
router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])
|
|
|
|
# Local storage path
|
|
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
|
|
|
|
|
|
# =============================================================================
|
|
# Enums and Pydantic Models
|
|
# =============================================================================
|
|
|
|
class WorksheetType(str, Enum):
    # Exercise layouts a worksheet can be generated in.

    # Translation exercise, English -> German.
    EN_TO_DE = "en_to_de"
    # Translation exercise, German -> English.
    DE_TO_EN = "de_to_en"
    # Copy practice: the word is written several times.
    COPY_PRACTICE = "copy"
    # Fill-in-the-blank sentences.
    GAP_FILL = "gap_fill"
    # All exercise types combined.
    COMBINED = "combined"
|
|
|
|
|
|
class SessionStatus(str, Enum):
    # Lifecycle states of a vocabulary session.

    # Session exists but nothing has been uploaded yet.
    PENDING = "pending"
    # OCR / extraction is running.
    PROCESSING = "processing"
    # Vocabulary has been extracted and can be edited.
    EXTRACTED = "extracted"
    # A worksheet has been generated.
    COMPLETED = "completed"
|
|
|
|
|
|
class VocabularyEntry(BaseModel):
    # One English/German vocabulary pair plus optional metadata.
    id: str
    english: str
    german: str
    example_sentence: Optional[str] = None
    # Example sentence with ___ in place of the word (gap-fill exercises).
    example_sentence_gap: Optional[str] = None
    # Part of speech: noun, verb, adjective, etc.
    word_type: Optional[str] = None
    # 1-indexed page number on which the entry was found.
    source_page: Optional[int] = None
|
|
|
|
|
|
class SessionCreate(BaseModel):
    # Request body for creating a vocabulary session.
    name: str
    description: Optional[str] = None
    # Language of the source text; defaults to English.
    source_language: str = "en"
    # Language of the translations; defaults to German.
    target_language: str = "de"
|
|
|
|
|
|
class SessionResponse(BaseModel):
    # Session summary returned by the session endpoints.
    id: str
    name: str
    description: Optional[str]
    source_language: str
    target_language: str
    # Endpoints in this module fill this with a SessionStatus value string.
    status: str
    vocabulary_count: int
    # Path of the stored source image; None until something is uploaded.
    image_path: Optional[str]
    created_at: datetime
|
|
|
|
|
|
class VocabularyResponse(BaseModel):
    # Vocabulary of one session together with the extraction confidence.
    session_id: str
    vocabulary: List[VocabularyEntry]
    # Confidence estimate recorded for the session at extraction time, if any.
    extraction_confidence: Optional[float]
|
|
|
|
|
|
class VocabularyUpdate(BaseModel):
    # Request body carrying the full replacement list of vocabulary entries.
    vocabulary: List[VocabularyEntry]
|
|
|
|
|
|
class WorksheetGenerateRequest(BaseModel):
    # Options for generating worksheets from a session's vocabulary.
    worksheet_types: List[WorksheetType]
    title: Optional[str] = None
    include_solutions: bool = True
    # Number of repetitions for copy-practice worksheets.
    repetitions: int = 3
    # One of: normal, large, extra-large.
    line_height: str = "normal"
|
|
|
|
|
|
class WorksheetResponse(BaseModel):
    # Metadata describing a generated worksheet.
    id: str
    session_id: str
    worksheet_types: List[str]
    pdf_path: str
    # Path of the solution document, when one exists.
    solution_path: Optional[str]
    generated_at: datetime
|
|
|
|
|
|
# =============================================================================
|
|
# In-Memory Storage (simplified - should use PostgreSQL in production)
|
|
# =============================================================================
|
|
|
|
# Session storage
|
|
_sessions: Dict[str, Dict[str, Any]] = {}
|
|
_worksheets: Dict[str, Dict[str, Any]] = {}
|
|
|
|
|
|
# =============================================================================
|
|
# Vision LLM Vocabulary Extraction
|
|
# =============================================================================
|
|
|
|
VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.
|
|
|
|
AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:
|
|
|
|
{
|
|
"vocabulary": [
|
|
{
|
|
"english": "to improve",
|
|
"german": "verbessern",
|
|
"example": "I want to improve my English."
|
|
}
|
|
]
|
|
}
|
|
|
|
REGELN:
|
|
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
|
|
2. Behalte die exakte Schreibweise bei
|
|
3. Bei fehlenden Beispielsaetzen: "example": null
|
|
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
|
|
5. Gib NUR valides JSON zurueck, keine Erklaerungen
|
|
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"
|
|
|
|
Beispiel-Output:
|
|
{
|
|
"vocabulary": [
|
|
{"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
|
|
{"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
|
|
]
|
|
}"""
|
|
|
|
|
|
async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    use_hybrid: bool = False  # DISABLED: PaddleOCR crashes on ARM64 Mac Mini
) -> tuple[List[VocabularyEntry], float, str]:
    """
    Extract vocabulary entries from a page image.

    The Vision LLM served by Ollama (model ``VISION_MODEL``) is the default
    path. When ``use_hybrid`` is True, the PaddleOCR + LLM extractor is tried
    first and the Vision LLM is used only if that extractor is missing, fails,
    reports an error, or yields nothing.

    Args:
        image_data: Image bytes
        filename: Original filename, used in log messages only
        page_number: 0-indexed page number; reported 1-indexed in error
            messages and in ``source_page``
        use_hybrid: Try the hybrid OCR + LLM extractor before the Vision LLM

    Returns:
        Tuple of (vocabulary_entries, confidence, error_message);
        error_message is an empty string on success. This function reports
        failures through the tuple instead of raising.
    """
    import httpx

    # ----------------------------------------------------------------------
    # Optional first attempt: hybrid PaddleOCR + LLM extractor (off by default)
    # ----------------------------------------------------------------------
    if use_hybrid:
        try:
            from hybrid_vocab_extractor import extract_vocabulary_hybrid
            logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")

            raw_entries, hybrid_confidence, hybrid_error = await extract_vocabulary_hybrid(
                image_data, page_number
            )

            if hybrid_error:
                # Reported problem: continue with the Vision LLM below.
                logger.warning(f"Hybrid extraction had issues: {hybrid_error}")
            elif raw_entries:
                hybrid_vocabulary = []
                for item in raw_entries:
                    # Entries lacking either language are dropped.
                    if not (item.get("english") and item.get("german")):
                        continue
                    hybrid_vocabulary.append(
                        VocabularyEntry(
                            id=str(uuid.uuid4()),
                            english=item.get("english", ""),
                            german=item.get("german", ""),
                            example_sentence=item.get("example"),
                            source_page=page_number + 1,
                        )
                    )
                logger.info(f"Hybrid extraction: {len(hybrid_vocabulary)} entries from {filename}")
                return hybrid_vocabulary, hybrid_confidence, ""

        except ImportError as exc:
            logger.warning(f"Hybrid extractor not available: {exc}. Falling back to Vision LLM.")
        except Exception as exc:
            logger.warning(f"Hybrid extraction failed: {exc}. Falling back to Vision LLM.")
            import traceback
            logger.debug(traceback.format_exc())

    # ----------------------------------------------------------------------
    # Vision LLM via Ollama
    # ----------------------------------------------------------------------
    logger.info(f"Using VISION LLM extraction for {filename}")

    try:
        # Cheap reachability probe before the expensive chat request.
        async with httpx.AsyncClient(timeout=10.0) as probe:
            try:
                tags_response = await probe.get(f"{OLLAMA_URL}/api/tags")
                if tags_response.status_code != 200:
                    logger.error(f"Ollama not available at {OLLAMA_URL}")
                    return [], 0.0, f"Seite {page_number + 1}: Ollama nicht verfuegbar"
            except Exception as exc:
                logger.error(f"Ollama health check failed: {exc}")
                return [], 0.0, f"Seite {page_number + 1}: Verbindung zu Ollama fehlgeschlagen"

        encoded_image = base64.b64encode(image_data).decode("utf-8")

        user_message = {
            "role": "user",
            "content": VOCAB_EXTRACTION_PROMPT,
            "images": [encoded_image],
        }
        generation_options = {
            "temperature": 0.1,
            "num_predict": 4096,
        }
        payload = {
            "model": VISION_MODEL,
            "messages": [user_message],
            "stream": False,
            "options": generation_options,
        }

        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")

        # Vision models can be slow, hence the long timeouts. The per-request
        # value (5 minutes per page) overrides the client-level one.
        async with httpx.AsyncClient(timeout=600.0) as client:
            response = await client.post(
                f"{OLLAMA_URL}/api/chat",
                json=payload,
                timeout=300.0,
            )
            response.raise_for_status()

            body = response.json()
            answer_text = body.get("message", {}).get("content", "")

            logger.info(f"Ollama response received: {len(answer_text)} chars")

            entries = parse_vocabulary_json(answer_text)

            # Tag every entry with its 1-indexed source page.
            for vocab_entry in entries:
                vocab_entry.source_page = page_number + 1

            # Fixed heuristic: 0.85 when anything was extracted, 0.1 otherwise.
            if len(entries) > 0:
                confidence = 0.85
            else:
                confidence = 0.1

            logger.info(f"Vision LLM extracted {len(entries)} vocabulary entries from {filename}")

            return entries, confidence, ""

    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"Seite {page_number + 1}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as exc:
        logger.error(f"Vocabulary extraction failed for {filename}: {exc}")
        import traceback
        logger.error(traceback.format_exc())
        return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(exc)[:50]}"
|
|
|
|
|
|
def _get_demo_vocabulary() -> List[VocabularyEntry]:
    """Build a fixed set of demo entries for testing without a Vision LLM.

    Each call returns new ``VocabularyEntry`` objects with fresh random ids.
    """
    # (english, german, example sentence)
    demo_rows = (
        ("to achieve", "erreichen, erzielen", "She achieved her goals."),
        ("achievement", "Leistung, Errungenschaft", "That was a great achievement."),
        ("improve", "verbessern", "I want to improve my English."),
        ("improvement", "Verbesserung", "There has been a lot of improvement."),
        ("success", "Erfolg", "The project was a success."),
        ("successful", "erfolgreich", "She is a successful businesswoman."),
        ("fail", "scheitern, durchfallen", "Don't be afraid to fail."),
        ("failure", "Misserfolg, Versagen", "Failure is part of learning."),
    )

    demo_vocabulary = []
    for english, german, example in demo_rows:
        demo_vocabulary.append(
            VocabularyEntry(
                id=str(uuid.uuid4()),
                english=english,
                german=german,
                example_sentence=example,
            )
        )
    return demo_vocabulary
|
|
|
|
|
|
def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse vocabulary entries from an LLM response, tolerating malformed JSON.

    The outermost ``{...}`` span of ``text`` is parsed with increasingly
    lenient strategies. If all of them fail, entries are recovered with a
    regular expression over the whole text (this fallback yields no
    ``word_type``).

    Entries with an empty ``english`` or ``german`` value are skipped, as are
    entries whose text is implausibly long (likely hallucinations). A ``null``
    or non-string field affects only its own entry, not the whole result.

    Args:
        text: Raw model output, possibly with prose around the JSON.

    Returns:
        The parsed entries; an empty list when nothing usable is found. This
        function does not raise: unexpected errors are logged and yield ``[]``.
    """
    import re

    def as_text(value: Any) -> str:
        """Coerce a JSON scalar to stripped text; ``None`` becomes ''."""
        return "" if value is None else str(value).strip()

    def optional_text(value: Any) -> Optional[str]:
        """Keep strings as they are; anything else counts as absent."""
        return value if isinstance(value, str) else None

    def try_parse_json(json_str: str) -> Optional[Any]:
        """Return the decoded JSON value, or ``None`` if every strategy fails."""
        # Strategy 1: strict parse.
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass

        # Strategy 2: drop stray control characters and let the decoder accept
        # raw newlines/tabs inside string values (strict=False). Structural
        # whitespace between tokens is left alone, so pretty-printed JSON
        # still parses.
        cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', json_str)
        try:
            return json.loads(cleaned, strict=False)
        except json.JSONDecodeError:
            pass

        # Strategy 3a: remove trailing commas before } or ].
        fixed = re.sub(r',(\s*[}\]])', r'\1', cleaned)
        try:
            return json.loads(fixed, strict=False)
        except json.JSONDecodeError:
            pass

        # Strategy 3b: additionally quote bare keys. Tried last because the
        # pattern can also match text inside string values.
        fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
        try:
            return json.loads(fixed, strict=False)
        except json.JSONDecodeError:
            return None

    try:
        # Find JSON in response (may have extra text around it).
        start = text.find('{')
        end = text.rfind('}') + 1

        if start == -1 or end == 0:
            logger.warning("No JSON found in response")
            return []

        data = try_parse_json(text[start:end])

        if data is None:
            # Strategy 4: pull entries out with a regex, e.g. when the
            # response was cut off in the middle of the JSON document.
            logger.warning("JSON parsing failed, trying regex extraction")
            vocabulary = []
            # Match patterns like {"english": "...", "german": "...", ...}
            pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'

            for match in re.findall(pattern, text, re.IGNORECASE | re.DOTALL):
                english = match[0].strip()
                german = match[1].strip()
                example = match[2].strip() if match[2] else None

                if english and german:
                    vocabulary.append(
                        VocabularyEntry(
                            id=str(uuid.uuid4()),
                            english=english,
                            german=german,
                            example_sentence=example,
                        )
                    )

            if vocabulary:
                logger.info(f"Regex extraction found {len(vocabulary)} entries")
            else:
                logger.warning("Regex extraction found no entries")
            # Return here: ``data`` is None, there is nothing more to read.
            return vocabulary

        # Normal JSON parsing succeeded.
        entries = data.get("vocabulary") if isinstance(data, dict) else None
        if not isinstance(entries, list):
            logger.warning("Parsed JSON contains no 'vocabulary' list")
            return []

        vocabulary = []
        for entry in entries:
            if not isinstance(entry, dict):
                continue

            english = as_text(entry.get("english"))
            german = as_text(entry.get("german"))

            # Skip entries that look like hallucinations (very long text).
            if len(english) > 100 or len(german) > 200:
                logger.warning(f"Skipping suspicious entry: {english[:50]}...")
                continue

            if not english or not german:
                continue

            vocabulary.append(
                VocabularyEntry(
                    id=str(uuid.uuid4()),
                    english=english,
                    german=german,
                    example_sentence=optional_text(entry.get("example")),
                    word_type=optional_text(entry.get("word_type")),
                )
            )

        return vocabulary

    except Exception as e:
        # Last-resort safety net: extraction must never crash the upload flow.
        logger.error(f"Failed to parse vocabulary JSON: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []
|
|
|
|
|
|
# =============================================================================
|
|
# Worksheet PDF Generation
|
|
# =============================================================================
|
|
|
|
_GAP_HTML = '<span class="gap"></span>'


def _build_gap_sentence(sentence: str, english: str) -> str:
    """Return ``sentence`` as escaped HTML with the vocabulary word blanked out.

    The whole phrase is tried first, then its words from longest to shortest,
    so "to improve" blanks "to improve" or "improve" rather than the particle
    "to". Matching is case-insensitive, starts at a word boundary and swallows
    a trailing inflection ("achieve" also blanks "achieved"). If nothing
    matches, the sentence is returned escaped and otherwise unchanged.
    """
    import re
    from html import escape

    words = english.split()
    candidates = []
    if len(words) > 1:
        candidates.append(r"\s+".join(re.escape(word) for word in words))
    candidates.extend(re.escape(word) for word in sorted(words, key=len, reverse=True))

    for candidate in candidates:
        pattern = re.compile(r"(?<!\w)" + candidate + r"\w*", re.IGNORECASE)
        pieces = pattern.split(sentence)
        if len(pieces) > 1:
            # Escape only the sentence text; the gap markup stays as HTML.
            return _GAP_HTML.join(escape(piece) for piece in pieces)
    return escape(sentence)


def generate_worksheet_html(
    vocabulary: List[VocabularyEntry],
    worksheet_type: WorksheetType,
    title: str,
    show_solutions: bool = False,
    repetitions: int = 3,
    line_height: str = "normal"
) -> str:
    """Generate the HTML document for one worksheet.

    All vocabulary text and the title are HTML-escaped, because they come from
    OCR/LLM output and user edits.

    Args:
        vocabulary: Entries to render.
        worksheet_type: Exercise layout. EN_TO_DE, DE_TO_EN, COPY_PRACTICE and
            GAP_FILL each render a section; any other value (including
            COMBINED) renders only the page header.
            NOTE(review): presumably the caller expands COMBINED - confirm.
        title: Heading shown at the top of the page.
        show_solutions: Render answers instead of blanks.
        repetitions: How often the word is repeated in copy-practice solutions.
        line_height: "normal", "large" or "extra-large"; unknown values fall
            back to the "normal" spacing.

    Returns:
        A complete HTML document as a string.
    """
    from html import escape

    # Line height CSS
    line_heights = {
        "normal": "2.5em",
        "large": "3.5em",
        "extra-large": "4.5em"
    }
    lh = line_heights.get(line_height, "2.5em")

    # Fragments are collected in a list and joined once at the end.
    parts = [f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <style>
        @page {{ size: A4; margin: 2cm; }}
        body {{ font-family: 'Segoe UI', Arial, sans-serif; font-size: 14px; }}
        h1 {{ font-size: 24px; margin-bottom: 10px; }}
        .meta {{ color: #666; margin-bottom: 20px; }}
        .name-line {{ margin-bottom: 30px; }}
        .vocab-table {{ width: 100%; border-collapse: collapse; }}
        .vocab-table td {{ padding: 8px; border-bottom: 1px solid #ddd; line-height: {lh}; }}
        .vocab-word {{ width: 40%; font-weight: 500; }}
        .vocab-blank {{ width: 60%; border-bottom: 2px dotted #999; }}
        .vocab-answer {{ width: 60%; color: #2563eb; }}
        .gap {{ border-bottom: 2px solid #333; min-width: 100px; display: inline-block; }}
        .hint {{ color: #666; font-style: italic; font-size: 12px; }}
        .section {{ margin-top: 30px; }}
        .section-title {{ font-size: 16px; font-weight: 600; margin-bottom: 15px; color: #374151; }}
    </style>
</head>
<body>
    <h1>{escape(str(title))}</h1>
    <div class="name-line">Name: _________________________ Datum: _____________</div>
"""]

    def translation_rows(prompt_attr: str, answer_attr: str) -> None:
        """Append one table row per entry: prompt word plus answer or blank."""
        for entry in vocabulary:
            prompt = escape(getattr(entry, prompt_attr))
            if show_solutions:
                answer = escape(getattr(entry, answer_attr))
                parts.append(
                    f'<tr><td class="vocab-word">{prompt}</td><td class="vocab-answer">{answer}</td></tr>'
                )
            else:
                parts.append(
                    f'<tr><td class="vocab-word">{prompt}</td><td class="vocab-blank"></td></tr>'
                )

    if worksheet_type == WorksheetType.EN_TO_DE:
        parts.append('<div class="section"><div class="section-title">Uebersetze ins Deutsche:</div>')
        parts.append('<table class="vocab-table">')
        translation_rows("english", "german")
        parts.append('</table></div>')

    elif worksheet_type == WorksheetType.DE_TO_EN:
        parts.append('<div class="section"><div class="section-title">Uebersetze ins Englische:</div>')
        parts.append('<table class="vocab-table">')
        translation_rows("german", "english")
        parts.append('</table></div>')

    elif worksheet_type == WorksheetType.COPY_PRACTICE:
        parts.append('<div class="section"><div class="section-title">Schreibe jedes Wort mehrmals:</div>')
        parts.append('<table class="vocab-table">')
        for entry in vocabulary:
            word = escape(entry.english)
            parts.append(f'<tr><td class="vocab-word">{word}</td>')
            parts.append('<td class="vocab-blank">')
            if show_solutions:
                parts.append(f' {word} ' * repetitions)
            parts.append('</td></tr>')
        parts.append('</table></div>')

    elif worksheet_type == WorksheetType.GAP_FILL:
        entries_with_examples = [e for e in vocabulary if e.example_sentence]
        if entries_with_examples:
            parts.append('<div class="section"><div class="section-title">Fuege das passende Wort ein:</div>')
            for i, entry in enumerate(entries_with_examples, 1):
                # Blank out the English word inside its example sentence.
                gap_sentence = _build_gap_sentence(entry.example_sentence, entry.english)
                parts.append(f'<p>{i}. {gap_sentence}</p>')
                if show_solutions:
                    parts.append(f'<p class="hint">Loesung: {escape(entry.english)}</p>')
                else:
                    parts.append(f'<p class="hint">({escape(entry.german)})</p>')
            parts.append('</div>')

    parts.append('</body></html>')
    return "".join(parts)
|
|
|
|
|
|
async def generate_worksheet_pdf(html: str) -> bytes:
    """Render ``html`` to PDF bytes with WeasyPrint.

    If WeasyPrint cannot be imported, the UTF-8 encoded HTML is returned
    instead of a PDF (a warning is logged). Any other rendering error is
    logged and re-raised.
    """
    try:
        from weasyprint import HTML
        return HTML(string=html).write_pdf()
    except ImportError:
        logger.warning("WeasyPrint not available, returning HTML")
        return html.encode('utf-8')
    except Exception as exc:
        logger.error(f"PDF generation failed: {exc}")
        raise
|
|
|
|
|
|
# =============================================================================
|
|
# API Endpoints
|
|
# =============================================================================
|
|
|
|
@router.post("/sessions", response_model=SessionResponse)
async def create_session(session: SessionCreate):
    """Create a vocabulary session, register it in memory and make its storage directory.

    The new session starts in the PENDING state with an empty vocabulary.
    """
    new_id = str(uuid.uuid4())
    pending = SessionStatus.PENDING.value

    record = {
        "id": new_id,
        "name": session.name,
        "description": session.description,
        "source_language": session.source_language,
        "target_language": session.target_language,
        "status": pending,
        "vocabulary": [],
        "vocabulary_count": 0,
        "image_path": None,
        "extraction_confidence": None,
        # Naive UTC timestamp.
        "created_at": datetime.utcnow(),
    }
    _sessions[new_id] = record

    # Per-session directory below the local storage root.
    os.makedirs(os.path.join(LOCAL_STORAGE_PATH, new_id), exist_ok=True)

    return SessionResponse(
        id=new_id,
        name=session.name,
        description=session.description,
        source_language=session.source_language,
        target_language=session.target_language,
        status=pending,
        vocabulary_count=0,
        image_path=None,
        created_at=record["created_at"],
    )
|
|
|
|
|
|
def _created_at_sort_key(session: Dict[str, Any]) -> datetime:
    """Return a naive UTC ``datetime`` usable for ordering ``session``.

    ``created_at`` is a ``datetime`` for sessions made by ``create_session``
    but a string for sessions loaded by ``_load_all_sessions``; comparing the
    two raises ``TypeError``. Strings are parsed as ISO-8601, aware values are
    converted to naive UTC, and missing or unparsable values map to
    ``datetime.min`` (sorted as oldest).
    """
    from datetime import timezone  # local import: the module only imports ``datetime``

    value = session.get("created_at")
    if isinstance(value, str):
        try:
            value = datetime.fromisoformat(value.strip())
        except ValueError:
            value = None
    if not isinstance(value, datetime):
        return datetime.min
    if value.tzinfo is not None:
        value = value.astimezone(timezone.utc).replace(tzinfo=None)
    return value


@router.get("/sessions", response_model=List[SessionResponse])
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
    """List vocabulary sessions, newest first.

    Args:
        limit: Maximum number of sessions to return (1-100, default 50).
    """
    sessions = sorted(
        _sessions.values(),
        key=_created_at_sort_key,
        reverse=True
    )[:limit]

    return [
        SessionResponse(
            id=s["id"],
            name=s["name"],
            description=s.get("description"),
            source_language=s["source_language"],
            target_language=s["target_language"],
            status=s["status"],
            vocabulary_count=s.get("vocabulary_count", 0),
            image_path=s.get("image_path"),
            created_at=s["created_at"],
        )
        for s in sessions
    ]
|
|
|
|
|
|
@router.get("/sessions/{session_id}", response_model=SessionResponse)
async def get_session(session_id: str):
    """Return the summary of one session.

    Raises:
        HTTPException: 404 when ``session_id`` is unknown.
    """
    try:
        record = _sessions[session_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Session not found") from None

    return SessionResponse(
        id=record["id"],
        name=record["name"],
        description=record.get("description"),
        source_language=record["source_language"],
        target_language=record["target_language"],
        status=record["status"],
        vocabulary_count=record.get("vocabulary_count", 0),
        image_path=record.get("image_path"),
        created_at=record["created_at"],
    )
|
|
|
|
|
|
def get_pdf_page_count(pdf_data: bytes) -> int:
    """Return the number of pages in the PDF given as bytes.

    Any failure (PyMuPDF missing, unreadable data, ...) is logged and reported
    as a page count of 0 instead of raising.
    """
    try:
        import fitz

        document = fitz.open(stream=pdf_data, filetype="pdf")
        page_total = document.page_count
        document.close()
    except Exception as exc:
        logger.error(f"Failed to get PDF page count: {exc}")
        return 0
    return page_total
|
|
|
|
|
|
async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
    """Convert a specific page of PDF to PNG image using PyMuPDF.

    The PDF document is closed on every path, including when the requested
    page does not exist or rendering fails.

    Args:
        pdf_data: PDF file as bytes
        page_number: 0-indexed page number
        thumbnail: If True, render at zoom 0.5 (small preview) instead of
            zoom 2.0 (used for OCR)

    Returns:
        The rendered page as PNG bytes.

    Raises:
        HTTPException: 500 if PyMuPDF is not installed; 400 for any other
            failure (empty PDF, page out of range, render error).
    """
    try:
        import fitz  # PyMuPDF

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            if pdf_document.page_count == 0:
                raise ValueError("PDF has no pages")

            if page_number >= pdf_document.page_count:
                raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_document.page_count} pages)")

            page = pdf_document[page_number]

            # Render page to image
            # For thumbnails: lower resolution, for OCR: higher resolution
            zoom = 0.5 if thumbnail else 2.0
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)

            png_data = pix.tobytes("png")
        finally:
            # Release the document even when validation or rendering raised.
            pdf_document.close()

        logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
        return png_data

    except ImportError:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed")
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
|
|
|
|
|
|
async def convert_pdf_to_images(pdf_data: bytes, pages: List[int] = None) -> List[bytes]:
    """Convert multiple pages of PDF to PNG images.

    Pages are rendered at zoom 2.0. Requested page numbers at or beyond the
    page count are silently skipped. The PDF document is closed on every path,
    including when rendering fails.

    Args:
        pdf_data: PDF file as bytes
        pages: List of 0-indexed page numbers to convert. If None, convert all pages.

    Returns:
        PNG bytes for each converted page, in the order requested.

    Raises:
        HTTPException: 500 if PyMuPDF is not installed; 400 for any other
            failure (empty PDF, render error).
    """
    try:
        import fitz

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            if pdf_document.page_count == 0:
                raise ValueError("PDF has no pages")

            # If no pages specified, convert all
            if pages is None:
                pages = list(range(pdf_document.page_count))

            zoom = 2.0
            mat = fitz.Matrix(zoom, zoom)

            images = []
            for page_num in pages:
                if page_num < pdf_document.page_count:
                    page = pdf_document[page_num]
                    pix = page.get_pixmap(matrix=mat)
                    images.append(pix.tobytes("png"))
        finally:
            # Release the document even when rendering raised.
            pdf_document.close()

        logger.info(f"Converted {len(images)} PDF pages to images")
        return images

    except ImportError:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available")
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
|
|
|
|
|
|
@router.post("/sessions/{session_id}/upload")
async def upload_image(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Store an uploaded textbook page (image or PDF) and extract its vocabulary.

    For PDFs only the first page is converted to PNG and processed.
    The session ends in the EXTRACTED state; an extraction error, if any, is
    returned under the "error" key of the result.

    Supported formats: PNG, JPG, JPEG, PDF

    Raises:
        HTTPException: 404 for an unknown session, 400 for an unsupported
            file type.
    """
    logger.info(f"Upload request for session {session_id}")
    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")

    if session_id not in _sessions:
        logger.error(f"Session {session_id} not found")
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]

    # The type is judged by both the file extension and the declared content type.
    if file.filename:
        extension = file.filename.rsplit('.', 1)[-1].lower()
    else:
        extension = ''
    content_type = file.content_type or ''

    image_extensions = ('png', 'jpg', 'jpeg')
    image_content_types = ('image/png', 'image/jpeg', 'image/jpg')
    has_image_extension = extension in image_extensions
    is_pdf = extension == 'pdf' or content_type == 'application/pdf'
    is_image = has_image_extension or content_type in image_content_types

    if not (is_pdf or is_image):
        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
        raise HTTPException(
            status_code=400,
            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
        )

    # Extension of the stored file.
    if is_pdf:
        save_extension = 'png'  # PDFs are stored as their PNG rendering
    elif has_image_extension:
        save_extension = extension
    elif content_type == 'image/png':
        save_extension = 'png'
    else:
        save_extension = 'jpg'

    content = await file.read()
    logger.info(f"Read {len(content)} bytes from uploaded file")

    if is_pdf:
        logger.info("Converting PDF to image...")
        content = await convert_pdf_page_to_image(content, page_number=0)
        logger.info(f"PDF converted, image size: {len(content)} bytes")

    # Persist the (possibly converted) image below the session directory.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    image_path = os.path.join(session_dir, f"source.{save_extension}")
    with open(image_path, 'wb') as target:
        target.write(content)

    session["status"] = SessionStatus.PROCESSING.value
    session["image_path"] = image_path

    vocabulary, confidence, error = await extract_vocabulary_from_image(
        content, file.filename or "image.png", page_number=0
    )

    extracted_status = SessionStatus.EXTRACTED.value
    entry_count = len(vocabulary)

    session["vocabulary"] = [entry.dict() for entry in vocabulary]
    session["vocabulary_count"] = entry_count
    session["extraction_confidence"] = confidence
    session["status"] = extracted_status

    result = {
        "session_id": session_id,
        "filename": file.filename,
        "image_path": image_path,
        "vocabulary_count": entry_count,
        "extraction_confidence": confidence,
        "status": extracted_status,
    }
    if error:
        result["error"] = error

    return result
|
|
|
|
|
|
@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
async def get_vocabulary(session_id: str):
    """Return the vocabulary currently stored for a session.

    Raises:
        HTTPException: 404 when ``session_id`` is not a known session.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    record = _sessions[session_id]

    # Stored entries are plain dicts; rebuild the response models from them.
    entries = []
    for raw_entry in record.get("vocabulary", []):
        entries.append(VocabularyEntry(**raw_entry))

    confidence = record.get("extraction_confidence")
    return VocabularyResponse(
        session_id=session_id,
        vocabulary=entries,
        extraction_confidence=confidence,
    )
|
|
|
|
|
|
@router.put("/sessions/{session_id}/vocabulary")
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
    """Replace a session's vocabulary with manually corrected entries.

    Raises:
        HTTPException: 404 when ``session_id`` is not a known session.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    record = _sessions[session_id]
    corrected = update.vocabulary
    entry_total = len(corrected)

    # The session stores plain dicts, not model instances.
    record["vocabulary"] = [item.dict() for item in corrected]
    record["vocabulary_count"] = entry_total

    return {
        "session_id": session_id,
        "vocabulary_count": entry_total,
        "message": "Vocabulary updated successfully",
    }
|
|
|
|
|
|
@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
    """Generate worksheet PDF(s) from the vocabulary stored in a session.

    One HTML section is rendered per requested worksheet type and the
    sections are combined into a single PDF. When ``request.include_solutions``
    is set, a second PDF with solutions is produced the same way.

    Both PDFs are rendered before anything is written to disk, so a failure
    while rendering the solution does not leave an unregistered worksheet
    file behind.

    Raises:
        HTTPException: 404 for an unknown session, 400 when the session has
            no vocabulary, 500 when PDF rendering fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]

    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")

    worksheet_id = str(uuid.uuid4())
    title = request.title or session["name"]

    def _render_html(show_solutions: bool) -> str:
        """Concatenate the HTML of every requested worksheet type, one per page."""
        suffix = " (Loesung)" if show_solutions else ""
        return "".join(
            generate_worksheet_html(
                vocabulary=vocabulary,
                worksheet_type=wtype,
                title=f"{title} - {wtype.value}{suffix}",
                show_solutions=show_solutions,
                repetitions=request.repetitions,
                line_height=request.line_height,
            ) + '<div style="page-break-after: always;"></div>'
            for wtype in request.worksheet_types
        )

    # Render the worksheet PDF
    combined_html = _render_html(show_solutions=False)
    try:
        pdf_bytes = await generate_worksheet_pdf(combined_html)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}") from e

    # Render the solution PDF if requested (same error handling as above)
    solution_bytes = None
    if request.include_solutions:
        solution_html = _render_html(show_solutions=True)
        try:
            solution_bytes = await generate_worksheet_pdf(solution_html)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Solution PDF generation failed: {e}") from e

    # The session directory only exists if a file was uploaded earlier; a
    # session filled purely via the vocabulary PUT endpoint has none yet.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)

    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)

    solution_path = None
    if solution_bytes is not None:
        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
        with open(solution_path, 'wb') as f:
            f.write(solution_bytes)

    # Store worksheet info
    worksheet_data = {
        "id": worksheet_id,
        "session_id": session_id,
        "worksheet_types": [wt.value for wt in request.worksheet_types],
        "pdf_path": pdf_path,
        "solution_path": solution_path,
        "generated_at": datetime.utcnow(),
    }
    _worksheets[worksheet_id] = worksheet_data

    # Update session status
    session["status"] = SessionStatus.COMPLETED.value

    return WorksheetResponse(
        id=worksheet_id,
        session_id=session_id,
        worksheet_types=worksheet_data["worksheet_types"],
        pdf_path=pdf_path,
        solution_path=solution_path,
        generated_at=worksheet_data["generated_at"],
    )
|
|
|
|
|
|
@router.get("/worksheets/{worksheet_id}/pdf")
async def download_worksheet_pdf(worksheet_id: str):
    """Send the generated worksheet PDF as a file download.

    Raises:
        HTTPException: 404 when the worksheet id is unknown or its PDF file
            is no longer on disk.
    """
    if worksheet_id not in _worksheets:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    stored_path = _worksheets[worksheet_id]["pdf_path"]
    if not os.path.exists(stored_path):
        raise HTTPException(status_code=404, detail="PDF file not found")

    with open(stored_path, 'rb') as handle:
        payload = io.BytesIO(handle.read())

    disposition = f"attachment; filename=worksheet_{worksheet_id}.pdf"
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )
|
|
|
|
|
|
@router.get("/worksheets/{worksheet_id}/solution")
async def download_solution_pdf(worksheet_id: str):
    """Send the solution PDF of a worksheet as a file download.

    Raises:
        HTTPException: 404 when the worksheet id is unknown, no solution was
            generated for it, or the file is no longer on disk.
    """
    if worksheet_id not in _worksheets:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    stored_path = _worksheets[worksheet_id].get("solution_path")

    # A worksheet generated without solutions has no path stored.
    if not (stored_path and os.path.exists(stored_path)):
        raise HTTPException(status_code=404, detail="Solution PDF not found")

    with open(stored_path, 'rb') as handle:
        payload = io.BytesIO(handle.read())

    disposition = f"attachment; filename=solution_{worksheet_id}.pdf"
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )
|
|
|
|
|
|
@router.get("/sessions/{session_id}/image")
async def get_session_image(session_id: str):
    """Return the source image stored for a session.

    The media type is derived from the file extension of the stored path;
    unknown extensions are served as ``application/octet-stream``.

    Raises:
        HTTPException: 404 when the session is unknown or has no image on disk.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    stored_path = _sessions[session_id].get("image_path")
    if not (stored_path and os.path.exists(stored_path)):
        raise HTTPException(status_code=404, detail="Image not found")

    # Determine content type from the text after the last dot
    suffix = stored_path.rsplit('.', 1)[-1].lower()
    if suffix == 'png':
        media_type = 'image/png'
    elif suffix in ('jpg', 'jpeg'):
        media_type = 'image/jpeg'
    else:
        media_type = 'application/octet-stream'

    with open(stored_path, 'rb') as handle:
        payload = io.BytesIO(handle.read())

    return StreamingResponse(payload, media_type=media_type)
|
|
|
|
|
|
@router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Accept a PDF upload, store it for the session and report its page count.

    Intended to be called before processing so the user can select pages.
    The raw PDF bytes are kept in the session for the later page endpoints.

    Raises:
        HTTPException: 404 for an unknown session, 400 when neither the file
            extension nor the content type identifies a PDF.
    """
    logger.info(f"PDF info request for session {session_id}")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    record = _sessions[session_id]

    # Validate file type: either the extension or the content type must say PDF
    if file.filename:
        suffix = file.filename.split('.')[-1].lower()
    else:
        suffix = ''
    mime = file.content_type or ''

    if not (suffix == 'pdf' or mime == 'application/pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")

    pdf_bytes = await file.read()

    # Save PDF to the session directory
    target_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, "source.pdf")
    with open(target_path, 'wb') as handle:
        handle.write(pdf_bytes)

    total_pages = get_pdf_page_count(pdf_bytes)

    # Keep the PDF data in the session for later processing
    record["pdf_data"] = pdf_bytes
    record["pdf_path"] = target_path
    record["pdf_page_count"] = total_pages
    record["status"] = "pdf_uploaded"

    return {
        "session_id": session_id,
        "page_count": total_pages,
        "filename": file.filename,
    }
|
|
|
|
|
|
@router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
    """Get a thumbnail image of a specific PDF page.

    Uses fitz for rendering so that page_rotations (from OCR orientation
    detection) are applied consistently.

    Args:
        session_id: Session that holds the uploaded PDF.
        page_number: 0-indexed page to render.
        hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).

    Raises:
        HTTPException: 404 for an unknown session, 400 when no PDF is stored
            or the page number is out of range, 500 when rendering fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    # Reject out-of-range pages up front, as the page-image endpoint does.
    # Without this a negative index would be passed straight to fitz and an
    # index past the end would surface as a 500.
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        import fitz
        zoom = 2.0 if hires else 0.5
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            page = pdf_document[page_number]
            # Apply orientation correction detected during OCR processing
            rot = session.get("page_rotations", {}).get(page_number, 0)
            if rot:
                page.set_rotation(rot)
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)
            png_data = pix.tobytes("png")
        finally:
            # Close the document even when rendering raises
            pdf_document.close()
    except Exception as e:
        logger.error(f"PDF thumbnail failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
|
|
|
|
|
|
@router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
    """PDF page as PNG at arbitrary resolution (for editor view).

    Args:
        session_id: Session that holds the uploaded PDF.
        page_number: 0-indexed page to render.
        zoom: Zoom factor (0.5=72DPI, 1.0=144DPI, 2.0=288DPI, 4.0=576DPI).

    Raises:
        HTTPException: 404 for an unknown session, 400 when no PDF is stored
            or the page number is out of range, 500 when rendering fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            page = pdf_document[page_number]
            # Apply orientation correction detected during OCR processing
            rot = session.get("page_rotations", {}).get(page_number, 0)
            if rot:
                page.set_rotation(rot)
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)
            png_data = pix.tobytes("png")
        finally:
            # Close the document even when rendering raises
            pdf_document.close()
        logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
    except Exception as e:
        logger.error(f"PDF page image failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
|
|
|
|
|
|
@router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
    session_id: str,
    page_number: int,
):
    """
    Process a SINGLE page of an uploaded PDF using the OCR pipeline.

    Uses the multi-step CV pipeline (deskew → dewarp → columns → rows → words)
    when it is available and falls back to LLM vision extraction otherwise.

    The frontend should call this sequentially for each page.
    Returns the vocabulary for just this one page; the page's entries replace
    any earlier entries of the same page in the session vocabulary.

    Args:
        session_id: Session that holds the uploaded PDF.
        page_number: 0-indexed page to process (responses report it 1-indexed).

    Returns:
        A dict with ``success``; on failure it carries ``error`` and an empty
        vocabulary, on success the page vocabulary, counts, the extraction
        confidence and the rotation applied to the page.

    Raises:
        HTTPException: 404 for an unknown session, 400 when no PDF is stored
            or the page number is out of range.
    """
    logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)

    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
    rotation_deg = 0
    # The OCR pipeline yields no score of its own, so a fixed value is reported
    # for that path; the LLM fallback overrides it with its measured confidence.
    extraction_confidence = 0.9
    if OCR_PIPELINE_AVAILABLE:
        try:
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg = await _run_ocr_pipeline_for_page(
                img_bgr, page_number, session_id,
            )
        except Exception as e:
            # Best effort: report the failure to the frontend instead of a 500
            logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": f"OCR pipeline error: {e}",
                "vocabulary": [],
                "vocabulary_count": 0,
            }
    else:
        # Fallback to LLM vision extraction
        logger.warning("OCR pipeline not available, falling back to LLM vision")
        image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_number + 1}.png",
            page_number=page_number
        )
        if error:
            logger.warning(f"Page {page_number + 1} failed: {error}")
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": error,
                "vocabulary": [],
                "vocabulary_count": 0,
            }
        # Report what the extractor measured rather than the OCR-path constant
        extraction_confidence = confidence
        page_vocabulary = []
        for entry in vocabulary:
            entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
            entry_dict['source_page'] = page_number + 1
            if 'id' not in entry_dict or not entry_dict['id']:
                entry_dict['id'] = str(uuid.uuid4())
            page_vocabulary.append(entry_dict)

    logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")

    # Store rotation for this page (used by image/thumbnail endpoints)
    session.setdefault("page_rotations", {})[page_number] = rotation_deg

    # Add to session's vocabulary (append, don't replace)
    existing_vocab = session.get("vocabulary", [])
    # Remove any existing entries from this page (in case of re-processing)
    existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1]
    existing_vocab.extend(page_vocabulary)
    session["vocabulary"] = existing_vocab
    session["vocabulary_count"] = len(existing_vocab)
    session["status"] = SessionStatus.EXTRACTED.value

    return {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(existing_vocab),
        "extraction_confidence": extraction_confidence,
        "rotation": rotation_deg,
    }
|
|
|
|
|
|
async def _run_ocr_pipeline_for_page(
    img_bgr: np.ndarray,
    page_number: int,
    vocab_session_id: str,
) -> tuple:
    """Run the full OCR pipeline on a single page image and return vocab entries.

    Uses the same pipeline as the admin OCR pipeline (ocr_pipeline_api.py).

    Args:
        img_bgr: BGR numpy array (from render_pdf_high_res, same as admin pipeline).
        page_number: 0-indexed page number.
        vocab_session_id: Vocab session ID, used to name the debug pipeline session.

    Steps: orientation → deskew → dewarp → columns → rows → words → (LLM review)

    Returns:
        ``(entries, rotation_deg)`` where entries is a list of dicts and
        rotation_deg is the orientation correction reported by
        ``detect_and_fix_orientation``. A page whose detected columns contain
        neither an English nor a German column yields ``([], rotation_deg)``.

    Raises:
        ValueError: when column geometry cannot be detected, so rows cannot
            be located.
    """
    import time as _time

    t_total = _time.time()

    img_h, img_w = img_bgr.shape[:2]
    logger.info(f"OCR Pipeline page {page_number + 1}: image {img_w}x{img_h}")

    # 1b. Orientation detection (fix upside-down scans)
    t0 = _time.time()
    img_bgr, rotation = detect_and_fix_orientation(img_bgr)
    if rotation:
        # The image was replaced, so refresh the cached dimensions
        img_h, img_w = img_bgr.shape[:2]
        logger.info(f" orientation: rotated {rotation}° ({_time.time() - t0:.1f}s)")
    else:
        logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")

    # 2. Create pipeline session in DB (for debugging in admin UI).
    # Best effort: a DB failure must not abort the extraction.
    pipeline_session_id = str(uuid.uuid4())
    try:
        _, png_buf = cv2.imencode(".png", img_bgr)
        original_png = png_buf.tobytes()
        await create_pipeline_session_db(
            pipeline_session_id,
            name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
            filename=f"page_{page_number + 1}.png",
            original_png=original_png,
        )
    except Exception as e:
        logger.warning(f"Could not create pipeline session in DB: {e}")

    # 3. Deskew (the debug dict exposes the angle of each pass)
    t0 = _time.time()
    deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
    angle_pass1 = deskew_debug.get("pass1_angle", 0.0)
    angle_pass2 = deskew_debug.get("pass2_angle", 0.0)
    angle_pass3 = deskew_debug.get("pass3_angle", 0.0)

    logger.info(f" deskew: p1={angle_pass1:.2f} p2={angle_pass2:.2f} "
                f"p3={angle_pass3:.2f} total={angle_applied:.2f} "
                f"({_time.time() - t0:.1f}s)")

    # 4. Dewarp
    t0 = _time.time()
    dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
    logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")

    # 5. Column detection
    t0 = _time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    h, w = ocr_img.shape[:2]

    geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
    if geo_result is None:
        # Geometry detection failed: fall back to layout analysis for columns
        layout_img = create_layout_image(dewarped_bgr)
        regions = analyze_layout(layout_img, ocr_img)
        word_dicts = None
        inv = None
        content_bounds = None
    else:
        geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
        content_w = right_x - left_x
        header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
        geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                         top_y=top_y, header_y=header_y, footer_y=footer_y)
        geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
        geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
        content_h = bottom_y - top_y
        regions = positional_column_regions(geometries, content_w, content_h, left_x)
        content_bounds = (left_x, right_x, top_y, bottom_y)

    logger.info(f" columns: {len(regions)} detected ({_time.time() - t0:.1f}s)")

    # 6. Row detection
    t0 = _time.time()
    if word_dicts is None or inv is None or content_bounds is None:
        # Re-run geometry detection to get intermediates
        geo_result2 = detect_column_geometry(ocr_img, dewarped_bgr)
        if geo_result2 is None:
            raise ValueError("Column geometry detection failed — cannot detect rows")
        _, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result2
        content_bounds = (left_x, right_x, top_y, bottom_y)

    left_x, right_x, top_y, bottom_y = content_bounds
    rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
    logger.info(f" rows: {len(rows)} detected ({_time.time() - t0:.1f}s)")

    # 7. Word recognition (cell-first OCR v2)
    t0 = _time.time()
    col_regions = regions  # already PageRegion objects

    # Populate row.words for word_count filtering: a word belongs to the row
    # that contains its vertical centre (row.y is shifted by top_y first).
    for row in rows:
        row_y_rel = row.y - top_y
        row_bottom_rel = row_y_rel + row.height
        row.words = [
            wd for wd in word_dicts
            if row_y_rel <= wd['top'] + wd['height'] / 2 < row_bottom_rel
        ]
        row.word_count = len(row.words)

    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, rows, img_w, img_h,
        ocr_engine="auto", img_bgr=dewarped_bgr,
    )

    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    logger.info(f" words: {len(cells)} cells, vocab={is_vocab} ({_time.time() - t0:.1f}s)")

    if not is_vocab:
        logger.warning(f" Page {page_number + 1}: layout is not vocab table "
                       f"(types: {col_types}), returning empty")
        # Keep the documented (entries, rotation) shape: the caller unpacks two
        # values, so a bare list would raise there and be reported as a
        # pipeline failure instead of an empty page.
        return [], rotation

    # 8. Map cells → vocab entries
    entries = _cells_to_vocab_entries(cells, columns_meta)
    entries = _fix_phonetic_brackets(entries, pronunciation="british")

    # 9. Optional LLM review (best effort; failures only skip the review)
    try:
        review_result = await llm_review_entries(entries)
        if review_result and review_result.get("changes"):
            # Apply corrections; a later change for the same index wins
            changes_map = {}
            for ch in review_result["changes"]:
                idx = ch.get("index")
                if idx is not None:
                    changes_map[idx] = ch
            for idx, ch in changes_map.items():
                if 0 <= idx < len(entries):
                    for field in ("english", "german", "example"):
                        if ch.get(field) and ch[field] != entries[idx].get(field):
                            entries[idx][field] = ch[field]
            logger.info(f" llm review: {len(review_result['changes'])} corrections applied")
    except Exception as e:
        logger.warning(f" llm review skipped: {e}")

    # 10. Map to frontend format
    page_vocabulary = []
    for entry in entries:
        if not entry.get("english") and not entry.get("german"):
            continue  # skip empty rows
        page_vocabulary.append({
            "id": str(uuid.uuid4()),
            "english": entry.get("english", ""),
            "german": entry.get("german", ""),
            "example_sentence": entry.get("example", ""),
            "source_page": page_number + 1,
        })

    # 11. Update pipeline session in DB (for admin debugging, best effort)
    try:
        success_dsk, dsk_buf = cv2.imencode(".png", deskewed_bgr)
        deskewed_png = dsk_buf.tobytes() if success_dsk else None
        success_dwp, dwp_buf = cv2.imencode(".png", dewarped_bgr)
        dewarped_png = dwp_buf.tobytes() if success_dwp else None

        await update_pipeline_session_db(
            pipeline_session_id,
            deskewed_png=deskewed_png,
            dewarped_png=dewarped_png,
            deskew_result={"angle_applied": round(angle_applied, 3)},
            dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
            column_result={"columns": [{"type": r.type, "x": r.x, "y": r.y,
                                        "width": r.width, "height": r.height}
                                       for r in col_regions]},
            row_result={"total_rows": len(rows)},
            word_result={
                "entry_count": len(page_vocabulary),
                "layout": "vocab",
                "vocab_entries": entries,
            },
            current_step=6,
        )
    except Exception as e:
        logger.warning(f"Could not update pipeline session: {e}")

    total_duration = _time.time() - t_total
    logger.info(f"OCR Pipeline page {page_number + 1}: "
                f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")

    return page_vocabulary, rotation
|
|
|
|
|
|
@router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
    session_id: str,
    pages: List[int] = None,
    process_all: bool = False,
):
    """
    Process specific pages of an uploaded PDF.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    The session vocabulary is replaced by the entries extracted here and the
    first converted page is saved as the session's preview image.

    Args:
        session_id: Session that holds the uploaded PDF.
        pages: List of 0-indexed page numbers to process (defaults to the first page)
        process_all: If True, process all pages

    Returns:
        A summary dict with per-page success/failure lists (1-indexed), the
        vocabulary count, the average confidence over successful pages and,
        when any page failed, the list of error messages.

    Raises:
        HTTPException: 404 for an unknown session, 400 when no PDF is stored
            or a requested page number is out of range.
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)

    # Determine which pages to process
    if process_all:
        pages = list(range(page_count))
    elif not pages:
        pages = [0]  # Default to first page

    # Reject out-of-range pages like the single-page endpoints do, instead of
    # handing unchecked indices to the PDF converter.
    invalid_pages = [p for p in pages if p < 0 or p >= page_count]
    if invalid_pages:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid page number(s) {invalid_pages}. PDF has {page_count} pages (0-indexed).",
        )

    # Convert selected pages to images
    images = await convert_pdf_to_images(pdf_data, pages)

    # Extract vocabulary from each page SEQUENTIALLY
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []

    # Pair each image with the page it was requested for
    for page_num, image_data in zip(pages, images):
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")

        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_num + 1}.png",
            page_number=page_num
        )

        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
        else:
            successful_pages.append(page_num + 1)
            total_confidence += confidence

            # Add page info to each entry and convert to dict
            for entry in vocabulary:
                entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
                entry_dict['source_page'] = page_num + 1
                all_vocabulary.append(entry_dict)

            logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")

    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0

    # Update session
    session["vocabulary"] = all_vocabulary
    session["vocabulary_count"] = len(all_vocabulary)
    session["extraction_confidence"] = avg_confidence
    session["processed_pages"] = pages
    session["successful_pages"] = successful_pages
    session["failed_pages"] = failed_pages
    session["status"] = SessionStatus.EXTRACTED.value

    # Save first page as preview image
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        # Ensure the directory exists before writing the preview
        os.makedirs(session_dir, exist_ok=True)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])
        session["image_path"] = image_path

    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }

    if error_messages:
        result["errors"] = error_messages

    return result
|
|
|
|
|
|
@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
    """Remove a session, its directory on disk and its registered worksheets.

    Raises:
        HTTPException: 404 when ``session_id`` is not a known session.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # Delete session directory
    storage_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    if os.path.exists(storage_dir):
        import shutil
        shutil.rmtree(storage_dir)

    # Remove from storage
    _sessions.pop(session_id)

    # Remove associated worksheets; collect the ids first so the registry is
    # not mutated while it is being iterated.
    stale_ids = [
        worksheet_id
        for worksheet_id, worksheet in _worksheets.items()
        if worksheet["session_id"] == session_id
    ]
    for worksheet_id in stale_ids:
        del _worksheets[worksheet_id]

    return {"message": "Session deleted successfully", "session_id": session_id}
|
|
|
|
|
|
# =============================================================================
|
|
# OCR Export Endpoints (for cross-app OCR data sharing)
|
|
# =============================================================================
|
|
|
|
# Directory below the local storage root where the OCR export endpoints
# write and read their JSON files.
OCR_EXPORT_DIR = os.path.join(LOCAL_STORAGE_PATH, "ocr-exports")
|
|
|
|
|
|
@router.post("/sessions/{session_id}/ocr-export/{page_number}")
async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)):
    """
    Persist OCR export data for cross-app sharing (admin-v2 -> studio-v2).

    Both apps proxy to klausur-service via /klausur-api/, so this endpoint
    serves as shared storage accessible from both ports.

    Two files are written to the export directory: the export itself, named
    after session and page, and ``latest.json`` pointing at the export that
    was saved most recently.
    """

    logger.info(f"Saving OCR export for session {session_id}, page {page_number}")

    os.makedirs(OCR_EXPORT_DIR, exist_ok=True)

    def _write_json(target_path, payload):
        """Write payload as indented UTF-8 JSON, keeping non-ASCII characters."""
        with open(target_path, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    # Save the export data
    export_name = f"{session_id}_page{page_number}.json"
    _write_json(os.path.join(OCR_EXPORT_DIR, export_name), data)

    # Update latest pointer
    pointer = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.utcnow().isoformat(),
    }
    _write_json(os.path.join(OCR_EXPORT_DIR, "latest.json"), pointer)

    return {
        "success": True,
        "session_id": session_id,
        "page_number": page_number,
        "message": "OCR export saved successfully",
    }
|
|
|
|
|
|
@router.get("/sessions/{session_id}/ocr-export/{page_number}")
async def load_ocr_export(session_id: str, page_number: int):
    """Return the stored OCR export for one session page.

    Raises:
        HTTPException: 404 when no export file exists for that session/page.
    """

    export_name = f"{session_id}_page{page_number}.json"
    stored_path = os.path.join(OCR_EXPORT_DIR, export_name)

    if os.path.exists(stored_path):
        with open(stored_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    raise HTTPException(status_code=404, detail="OCR export not found")
|
|
|
|
|
|
# =============================================================================
|
|
# OCR Compare & Grid Analysis Endpoints
|
|
# =============================================================================
|
|
|
|
|
|
@router.post("/sessions/{session_id}/compare-ocr/{page_number}")
|
|
async def compare_ocr_methods(session_id: str, page_number: int):
|
|
"""
|
|
Run multiple OCR methods on a page and compare results.
|
|
|
|
This endpoint:
|
|
1. Gets the page image from the session's uploaded PDF
|
|
2. Runs Vision LLM extraction (primary method)
|
|
3. Optionally runs Tesseract extraction
|
|
4. Compares found vocabulary across methods
|
|
5. Returns structured comparison results
|
|
|
|
page_number is 0-indexed.
|
|
"""
|
|
import httpx
|
|
import time
|
|
|
|
logger.info(f"Compare OCR for session {session_id}, page {page_number}")
|
|
|
|
if session_id not in _sessions:
|
|
raise HTTPException(status_code=404, detail="Session not found")
|
|
|
|
session = _sessions[session_id]
|
|
pdf_data = session.get("pdf_data")
|
|
|
|
if not pdf_data:
|
|
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
|
|
|
page_count = session.get("pdf_page_count", 1)
|
|
if page_number < 0 or page_number >= page_count:
|
|
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
|
|
|
|
# Convert page to image
|
|
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
|
|
|
methods_results = {}
|
|
all_vocab_sets = {}
|
|
|
|
# --- Method: Vision LLM ---
|
|
try:
|
|
start = time.time()
|
|
vocab, confidence, error = await extract_vocabulary_from_image(
|
|
image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False
|
|
)
|
|
duration = time.time() - start
|
|
|
|
vocab_list = []
|
|
for v in vocab:
|
|
entry = v.dict() if hasattr(v, 'dict') else (v.__dict__.copy() if hasattr(v, '__dict__') else dict(v))
|
|
vocab_list.append({
|
|
"english": entry.get("english", ""),
|
|
"german": entry.get("german", ""),
|
|
"example": entry.get("example_sentence", ""),
|
|
})
|
|
|
|
methods_results["vision_llm"] = {
|
|
"name": "Vision LLM",
|
|
"model": VISION_MODEL,
|
|
"duration_seconds": round(duration, 1),
|
|
"vocabulary_count": len(vocab_list),
|
|
"vocabulary": vocab_list,
|
|
"confidence": confidence,
|
|
"success": len(vocab_list) > 0 and not error,
|
|
"error": error if error else None,
|
|
}
|
|
all_vocab_sets["vision_llm"] = {(v["english"].lower().strip(), v["german"].lower().strip()) for v in vocab_list if v["english"] and v["german"]}
|
|
except Exception as e:
|
|
logger.error(f"Vision LLM failed: {e}")
|
|
methods_results["vision_llm"] = {
|
|
"name": "Vision LLM",
|
|
"model": VISION_MODEL,
|
|
"duration_seconds": 0,
|
|
"vocabulary_count": 0,
|
|
"vocabulary": [],
|
|
"confidence": 0,
|
|
"success": False,
|
|
"error": str(e),
|
|
}
|
|
all_vocab_sets["vision_llm"] = set()
|
|
|
|
# --- Method: Tesseract OCR (bounding boxes + vocab extraction) ---
|
|
if TESSERACT_AVAILABLE:
|
|
try:
|
|
start = time.time()
|
|
tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu")
|
|
duration = time.time() - start
|
|
|
|
tess_vocab = tess_result.get("vocabulary", [])
|
|
tess_words = tess_result.get("words", [])
|
|
|
|
# Store Tesseract words in session for later use (grid analysis, position matching)
|
|
session["tesseract_words"] = tess_words
|
|
session["tesseract_image_width"] = tess_result.get("image_width", 0)
|
|
session["tesseract_image_height"] = tess_result.get("image_height", 0)
|
|
session[f"tesseract_page_{page_number}"] = tess_result
|
|
|
|
vocab_list_tess = []
|
|
for v in tess_vocab:
|
|
vocab_list_tess.append({
|
|
"english": v.get("english", ""),
|
|
"german": v.get("german", ""),
|
|
"example": v.get("example", ""),
|
|
})
|
|
|
|
methods_results["tesseract"] = {
|
|
"name": "Tesseract OCR",
|
|
"model": "tesseract-ocr (eng+deu)",
|
|
"duration_seconds": round(duration, 1),
|
|
"vocabulary_count": len(vocab_list_tess),
|
|
"vocabulary": vocab_list_tess,
|
|
"confidence": 0.7 if tess_vocab else 0,
|
|
"success": len(vocab_list_tess) > 0,
|
|
"error": tess_result.get("error"),
|
|
"word_count": tess_result.get("word_count", 0),
|
|
"columns_detected": len(tess_result.get("columns", [])),
|
|
}
|
|
all_vocab_sets["tesseract"] = {
|
|
(v["english"].lower().strip(), v["german"].lower().strip())
|
|
for v in vocab_list_tess if v["english"] and v["german"]
|
|
}
|
|
|
|
# Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results
|
|
if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]:
|
|
llm_vocab_with_bbox = match_positions_to_vocab(
|
|
tess_words,
|
|
methods_results["vision_llm"]["vocabulary"],
|
|
tess_result.get("image_width", 1),
|
|
tess_result.get("image_height", 1),
|
|
)
|
|
methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox
|
|
|
|
except Exception as e:
|
|
logger.error(f"Tesseract failed: {e}")
|
|
import traceback
|
|
logger.debug(traceback.format_exc())
|
|
methods_results["tesseract"] = {
|
|
"name": "Tesseract OCR",
|
|
"model": "tesseract-ocr",
|
|
"duration_seconds": 0,
|
|
"vocabulary_count": 0,
|
|
"vocabulary": [],
|
|
"confidence": 0,
|
|
"success": False,
|
|
"error": str(e),
|
|
}
|
|
all_vocab_sets["tesseract"] = set()
|
|
|
|
# --- Method: CV Pipeline (Document Reconstruction) ---
|
|
if CV_PIPELINE_AVAILABLE:
|
|
try:
|
|
start = time.time()
|
|
cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
|
|
duration = time.time() - start
|
|
|
|
cv_vocab = cv_result.vocabulary if not cv_result.error else []
|
|
vocab_list_cv = []
|
|
for v in cv_vocab:
|
|
vocab_list_cv.append({
|
|
"english": v.get("english", ""),
|
|
"german": v.get("german", ""),
|
|
"example": v.get("example", ""),
|
|
})
|
|
|
|
methods_results["cv_pipeline"] = {
|
|
"name": "CV Pipeline (Document Reconstruction)",
|
|
"model": "opencv + tesseract (multi-pass)",
|
|
"duration_seconds": round(duration, 1),
|
|
"vocabulary_count": len(vocab_list_cv),
|
|
"vocabulary": vocab_list_cv,
|
|
"confidence": 0.8 if cv_vocab else 0,
|
|
"success": len(vocab_list_cv) > 0,
|
|
"error": cv_result.error,
|
|
"word_count": cv_result.word_count,
|
|
"columns_detected": cv_result.columns_detected,
|
|
"stages": cv_result.stages,
|
|
}
|
|
all_vocab_sets["cv_pipeline"] = {
|
|
(v["english"].lower().strip(), v["german"].lower().strip())
|
|
for v in vocab_list_cv if v["english"] and v["german"]
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"CV Pipeline failed: {e}")
|
|
import traceback
|
|
logger.debug(traceback.format_exc())
|
|
methods_results["cv_pipeline"] = {
|
|
"name": "CV Pipeline (Document Reconstruction)",
|
|
"model": "opencv + tesseract (multi-pass)",
|
|
"duration_seconds": 0,
|
|
"vocabulary_count": 0,
|
|
"vocabulary": [],
|
|
"confidence": 0,
|
|
"success": False,
|
|
"error": str(e),
|
|
}
|
|
all_vocab_sets["cv_pipeline"] = set()
|
|
|
|
# --- Build comparison ---
|
|
all_unique = set()
|
|
for vs in all_vocab_sets.values():
|
|
all_unique |= vs
|
|
|
|
found_by_all = []
|
|
found_by_some = []
|
|
for english, german in sorted(all_unique):
|
|
found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs]
|
|
entry = {"english": english, "german": german, "methods": found_in}
|
|
if len(found_in) == len(all_vocab_sets):
|
|
found_by_all.append(entry)
|
|
else:
|
|
found_by_some.append(entry)
|
|
|
|
total_methods = max(len(all_vocab_sets), 1)
|
|
agreement_rate = len(found_by_all) / max(len(all_unique), 1) if all_unique else 0
|
|
|
|
# Find best method
|
|
best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"
|
|
|
|
return {
|
|
"session_id": session_id,
|
|
"page_number": page_number,
|
|
"methods": methods_results,
|
|
"comparison": {
|
|
"found_by_all_methods": found_by_all,
|
|
"found_by_some_methods": found_by_some,
|
|
"total_unique_vocabulary": len(all_unique),
|
|
"agreement_rate": agreement_rate,
|
|
},
|
|
"recommendation": {
|
|
"best_method": best_method,
|
|
"reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz",
|
|
},
|
|
}
|
|
|
|
|
|
# Prompt sent to the vision model when no usable Tesseract grid is available.
_GRID_ANALYSIS_PROMPT = """Analyze this textbook page image. It contains a vocabulary table/grid.

Your task: Identify the TABLE STRUCTURE and extract each cell's content.

Return a JSON object with this EXACT structure:
{
"rows": <number of rows>,
"columns": <number of columns>,
"column_types": ["english", "german", "example"],
"entries": [
{
"row": 0,
"col": 0,
"text": "the word or phrase in this cell",
"column_type": "english",
"confidence": 0.95
}
]
}

Rules:
- row and col are 0-indexed
- column_type is one of: "english", "german", "example", "unknown"
- Detect whether each column contains English words, German translations, or example sentences
- Include ALL non-empty cells
- confidence is 0.0-1.0 based on how clear the text is
- If a cell is empty, don't include it
- Return ONLY the JSON, no other text"""


def _grid_coerce_int(value: Any, default: int = 0) -> int:
    """Return *value* as an int, or *default* when it cannot be converted."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return default


def _grid_confidence(value: Any, default: float = 0.8) -> float:
    """Return a numeric confidence, falling back to *default* for junk values."""
    if isinstance(value, bool):
        return default
    if isinstance(value, (int, float)):
        return value
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _grid_from_llm_json(grid_raw: Any) -> Optional[Dict[str, Any]]:
    """Convert the vision model's JSON answer into the grid dict for the frontend.

    The model output is untrusted: counts may arrive as strings or floats,
    lists may be null, and cell fields may be missing or of the wrong type.
    Such values are coerced or skipped instead of aborting the analysis.

    Returns None when the answer describes no usable rows/columns.
    """
    if not isinstance(grid_raw, dict):
        return None

    num_rows = _grid_coerce_int(grid_raw.get("rows"))
    num_cols = _grid_coerce_int(grid_raw.get("columns"))
    if num_rows <= 0 or num_cols <= 0:
        return None

    # Pad column_types so that every column index has a type.
    raw_types = grid_raw.get("column_types")
    column_types = list(raw_types) if isinstance(raw_types, list) else []
    column_types.extend(["unknown"] * (num_cols - len(column_types)))

    # Cells are laid out as a uniform grid in percent of the page size.
    row_height = 100.0 / num_rows
    col_width = 100.0 / num_cols

    # Index the reported entries by (row, col). A missing index defaults to 0
    # (as before); an unparseable one maps to -1 and therefore never matches.
    raw_entries = grid_raw.get("entries")
    cell_map = {}
    for entry in raw_entries if isinstance(raw_entries, list) else []:
        if not isinstance(entry, dict):
            continue
        row_idx = _grid_coerce_int(entry.get("row", 0), default=-1)
        col_idx = _grid_coerce_int(entry.get("col", 0), default=-1)
        cell_map[(row_idx, col_idx)] = entry

    cells = []
    counts = {"recognized": 0, "problematic": 0, "empty": 0}
    for r in range(num_rows):
        row_cells = []
        for c in range(num_cols):
            text = ""
            conf = 0.0
            col_type = column_types[c]
            status = "empty"

            entry = cell_map.get((r, c))
            if entry is not None:
                raw_text = entry.get("text")
                text = "" if raw_text is None else str(raw_text).strip()
                conf = _grid_confidence(entry.get("confidence", 0.8))
                col_type = entry.get("column_type", column_types[c])
                if text:
                    status = "recognized" if conf >= 0.5 else "problematic"
            counts[status] += 1

            row_cells.append({
                "row": r,
                "col": c,
                "x": round(c * col_width, 2),
                "y": round(r * row_height, 2),
                "width": round(col_width, 2),
                "height": round(row_height, 2),
                "text": text,
                "confidence": conf,
                "status": status,
                "column_type": col_type,
            })
        cells.append(row_cells)

    total = num_rows * num_cols
    coverage = (counts["recognized"] + counts["problematic"]) / total

    return {
        "rows": num_rows,
        "columns": num_cols,
        "cells": cells,
        "column_types": column_types,
        # Column and row boundaries as percentages
        "column_boundaries": [round(c * col_width, 2) for c in range(num_cols + 1)],
        "row_boundaries": [round(r * row_height, 2) for r in range(num_rows + 1)],
        "deskew_angle": 0.0,
        "source": "vision_llm",
        "stats": {
            "recognized": counts["recognized"],
            "problematic": counts["problematic"],
            "empty": counts["empty"],
            "manual": 0,
            "total": total,
            "coverage": round(coverage, 3),
        },
    }


async def _grid_from_tesseract(session: Dict[str, Any], page_number: int,
                               image_data: bytes) -> Optional[Dict[str, Any]]:
    """Build a grid dict from Tesseract word boxes via GridDetectionService.

    Uses the per-page Tesseract result cached in *session* when present;
    otherwise runs the Tesseract pipeline on *image_data* and caches the result.

    Returns None when Tesseract yields no words, no image size or no regions.
    """
    cache_key = f"tesseract_page_{page_number}"
    tess_page_data = session.get(cache_key)

    # Run Tesseract if not already cached
    if not tess_page_data:
        logger.info("Running Tesseract for grid analysis (not cached)")
        from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
        tess_page_data = await _run_tess(image_data, lang="eng+deu")
        session[cache_key] = tess_page_data
        session["tesseract_words"] = tess_page_data.get("words", [])
        session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
        session["tesseract_image_height"] = tess_page_data.get("image_height", 0)

    tess_words = tess_page_data.get("words", [])
    img_w = tess_page_data.get("image_width", 0)
    img_h = tess_page_data.get("image_height", 0)
    if not (tess_words and img_w > 0 and img_h > 0):
        return None

    service = GridDetectionService()
    regions = service.convert_tesseract_regions(tess_words, img_w, img_h)
    if not regions:
        return None

    grid_result = service.detect_grid(regions)
    grid_dict = grid_result.to_dict()
    grid_dict["source"] = "tesseract+grid_service"
    grid_dict["word_count"] = len(tess_words)

    logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
                f"{grid_result.stats.get('recognized', 0)} recognized")
    return grid_dict


async def _grid_query_vision_llm(image_base64: str, max_retries: int = 3):
    """Ask the vision model for the grid structure of a page image.

    HTTP 500 answers are retried with a growing pause (10s, 20s, ...) until
    *max_retries* attempts have been made.

    Returns a ``(raw_text, error)`` tuple; *error* is None on success and a
    user-facing message otherwise.
    """
    import asyncio
    import httpx

    payload = {
        "model": VISION_MODEL,
        "messages": [{"role": "user", "content": _GRID_ANALYSIS_PROMPT, "images": [image_base64]}],
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 8192},
    }

    for attempt in range(max_retries):
        async with httpx.AsyncClient(timeout=300.0) as client:
            response = await client.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=300.0)

        if response.status_code == 500 and attempt < max_retries - 1:
            wait_time = 10 * (attempt + 1)
            logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
            await asyncio.sleep(wait_time)
            continue

        if response.status_code != 200:
            error_detail = response.text[:200] if response.text else "Unknown error"
            return "", f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."

        content = response.json().get("message", {}).get("content", "")
        return (content if isinstance(content, str) else ""), None

    return "", None


@router.post("/sessions/{session_id}/analyze-grid/{page_number}")
async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
    """
    Analyze the grid/table structure of a vocabulary page.

    Hybrid approach:
    1. If Tesseract and GridDetectionService are available (and use_tesseract
       is true), use Tesseract bounding boxes for real spatial positions.
       A cached per-page Tesseract result is reused when present.
    2. Otherwise, or when strategy 1 fails or yields nothing, fall back to the
       Vision LLM for grid structure detection.

    page_number is 0-indexed.

    Returns ``{"success": True, "grid": ...}`` with the GridData structure
    expected by the frontend GridOverlay component, or
    ``{"success": False, "error": ...}`` when the LLM fallback fails.

    Raises:
        HTTPException: 404 for an unknown session, 400 when no PDF was
            uploaded or the page number is out of range.
    """
    import httpx
    import re

    logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert page to image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
    if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
        try:
            grid_dict = await _grid_from_tesseract(session, page_number, image_data)
            if grid_dict is not None:
                return {"success": True, "grid": grid_dict}
            logger.info("Tesseract data insufficient, falling back to LLM")
        except Exception as e:
            # Best effort: any Tesseract/grid failure falls through to the LLM.
            logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
            logger.debug("Tesseract grid analysis traceback", exc_info=True)

    # --- Strategy 2: Fall back to Vision LLM ---
    image_base64 = base64.b64encode(image_data).decode("utf-8")

    try:
        raw_text, llm_error = await _grid_query_vision_llm(image_base64)
        if llm_error:
            return {"success": False, "error": llm_error}

        # Parse JSON from response (the model may wrap it in extra text)
        json_match = re.search(r'\{[\s\S]*\}', raw_text)
        if not json_match:
            return {"success": False, "error": "Could not parse grid structure from LLM response"}

        grid_data = _grid_from_llm_json(json.loads(json_match.group()))
        if grid_data is None:
            return {"success": False, "error": "No grid structure detected"}

        return {"success": True, "grid": grid_data}

    except httpx.TimeoutException:
        logger.error("Grid analysis timed out")
        return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"}
    except Exception as e:
        logger.error(f"Grid analysis failed: {e}")
        logger.debug("Grid analysis traceback", exc_info=True)
        return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"}
|
|
|
|
@router.get("/ocr-export/latest")
async def load_latest_ocr_export():
    """Return the contents of the most recently saved OCR export.

    ``latest.json`` inside OCR_EXPORT_DIR is read as a pointer whose
    ``session_id`` and ``page_number`` name the actual export file
    (``<session_id>_page<page_number>.json``) in the same directory.

    Raises:
        HTTPException: 404 when either the pointer or the export file it
            names does not exist.
    """

    def _read_json(path):
        # Both files are read as UTF-8 encoded JSON.
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    pointer_file = os.path.join(OCR_EXPORT_DIR, "latest.json")
    if not os.path.exists(pointer_file):
        raise HTTPException(status_code=404, detail="No OCR exports found")

    latest = _read_json(pointer_file)
    session_id = latest.get("session_id")
    page_number = latest.get("page_number")

    export_file = os.path.join(OCR_EXPORT_DIR, f"{session_id}_page{page_number}.json")
    if not os.path.exists(export_file):
        raise HTTPException(status_code=404, detail="Latest OCR export file not found")

    return _read_json(export_file)
|
|
|
|
# =============================================================================
# Ground Truth Labeling
# =============================================================================

# Directory for ground-truth label files. save_ground_truth() writes one
# "<session_id>_page<page_number>.json" file per page here and
# load_ground_truth() reads it back.
GROUND_TRUTH_DIR = os.path.join(LOCAL_STORAGE_PATH, "ground-truth")
|
|
|
|
async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Turn a page image into vocabulary rows with bounding boxes.

    Pipeline: Tesseract word boxes -> GridDetectionService regions -> detected
    grid -> one entry per non-empty grid row.

    Returns a dict with 'entries', 'image_width' and 'image_height'. Each
    entry carries row_index, english, german, example, confidence (mean
    confidence of the row's non-empty cells multiplied by 100), bbox (the
    union of the row's column boxes) and bbox_en / bbox_de / bbox_ex.
    All bbox coordinates are in percent (0-100).

    Raises:
        HTTPException: 500 when Tesseract or GridDetectionService is
            unavailable.
    """
    if not TESSERACT_AVAILABLE:
        raise HTTPException(status_code=500, detail="Tesseract not available")
    if not GRID_SERVICE_AVAILABLE:
        raise HTTPException(status_code=500, detail="GridDetectionService not available")

    # Stage 1: word-level boxes from Tesseract
    ocr = await extract_bounding_boxes(image_bytes, lang=lang)
    word_boxes = ocr.get("words", [])
    width = ocr.get("image_width", 0)
    height = ocr.get("image_height", 0)

    def _payload(rows):
        # Every exit path reports the same image dimensions.
        return {"entries": rows, "image_width": width, "image_height": height}

    if not word_boxes or width == 0 or height == 0:
        return _payload([])

    # Stage 2: percentage-based OCR regions
    detector = GridDetectionService()
    ocr_regions = detector.convert_tesseract_regions(word_boxes, width, height)
    if not ocr_regions:
        return _payload([])

    # Stage 3: grid detection
    grid = detector.detect_grid(ocr_regions)
    if not grid.cells:
        return _payload([])

    # Stage 4: collapse each grid row into one entry keyed by column type
    from services.grid_detection_service import ColumnType

    slot_by_column = (
        (ColumnType.ENGLISH, "en"),
        (ColumnType.GERMAN, "de"),
        (ColumnType.EXAMPLE, "ex"),
    )

    rows = []
    for row_number, cells_in_row in enumerate(grid.cells):
        texts = {"en": "", "de": "", "ex": ""}
        boxes = {"en": None, "de": None, "ex": None}
        confidence_total = 0.0
        cells_with_text = 0

        for cell in cells_in_row:
            box = {
                "x": round(cell.x, 2),
                "y": round(cell.y, 2),
                "w": round(cell.width, 2),
                "h": round(cell.height, 2),
            }

            # A later cell of the same column type replaces an earlier one.
            for column_type, slot in slot_by_column:
                if cell.column_type == column_type:
                    texts[slot] = cell.text.strip()
                    boxes[slot] = box
                    break

            if cell.text.strip():
                confidence_total += cell.confidence
                cells_with_text += 1

        # Rows without any english/german/example text are dropped.
        if not any(texts.values()):
            continue

        # Union of the column boxes gives the whole-row box.
        present = [b for b in (boxes["en"], boxes["de"], boxes["ex"]) if b is not None]
        if present:
            left = min(b["x"] for b in present)
            top = min(b["y"] for b in present)
            right = max(b["x"] + b["w"] for b in present)
            bottom = max(b["y"] + b["h"] for b in present)
            row_box = {
                "x": round(left, 2),
                "y": round(top, 2),
                "w": round(right - left, 2),
                "h": round(bottom - top, 2),
            }
        else:
            row_box = {"x": 0, "y": 0, "w": 100, "h": 3}

        if cells_with_text > 0:
            mean_confidence = round(confidence_total / cells_with_text * 100, 1)
        else:
            mean_confidence = round(0, 1)

        rows.append({
            "row_index": row_number,
            "english": texts["en"],
            "german": texts["de"],
            "example": texts["ex"],
            "confidence": mean_confidence,
            "bbox": row_box,
            "bbox_en": boxes["en"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_de": boxes["de"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_ex": boxes["ex"] or {"x": 0, "y": 0, "w": 0, "h": 0},
        })

    return _payload(rows)
|
|
|
|
@router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int):
    """Extract vocabulary rows plus bounding boxes for ground-truth labeling.

    The page is rendered at full resolution, deskewed on a best-effort basis,
    and passed to extract_entries_with_boxes(). The image actually used for
    OCR and the resulting entries are cached in the session under
    ``deskewed_images`` / ``gt_entries``, keyed by the page number as a string.

    page_number is 0-indexed.

    Raises:
        HTTPException: 404 for an unknown session, 400 when no PDF was
            uploaded or the page number is out of range.
    """
    logger.info(f"Extract with boxes for session {session_id}, page {page_number}")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = _sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if not 0 <= page_number < page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Full-resolution render of the requested page
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Straighten the page before OCR; a failure here keeps the unrotated image.
    deskew_angle = 0.0
    try:
        from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
        if CV2_AVAILABLE:
            image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
            logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
    except Exception as e:
        logger.warning(f"Deskew failed for page {page_number}: {e}")

    page_key = str(page_number)

    # Keep the image that OCR runs on so it can be served later.
    session.setdefault("deskewed_images", {})[page_key] = image_data

    # OCR + grid detection on the (possibly deskewed) image
    result = await extract_entries_with_boxes(image_data)
    entries = result["entries"]

    # Remember the extracted entries for this page
    session.setdefault("gt_entries", {})[page_key] = entries

    response = {
        "success": True,
        "entries": entries,
        "entry_count": len(entries),
        "image_width": result["image_width"],
        "image_height": result["image_height"],
        "deskew_angle": round(deskew_angle, 2),
        "deskewed": abs(deskew_angle) > 0.05,
    }
    return response
|
|
|
|
@router.get("/sessions/{session_id}/deskewed-image/{page_number}")
async def get_deskewed_image(session_id: str, page_number: int):
    """Return the deskewed page image as PNG.

    Serves the image cached in the session under ``deskewed_images`` when one
    exists for this page. Otherwise falls back to rendering the original
    hires image from the uploaded PDF.

    page_number is 0-indexed.

    Raises:
        HTTPException: 404 for an unknown session; on the fallback path, 400
            when no PDF was uploaded or the page number is out of range.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    deskewed = session.get("deskewed_images", {}).get(str(page_number))

    if deskewed:
        return StreamingResponse(io.BytesIO(deskewed), media_type="image/png")

    # Fallback: render original hires image
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    # Validate the page like the other page endpoints do, so that an
    # out-of-range request is rejected before it reaches the PDF renderer.
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
    return StreamingResponse(io.BytesIO(image_data), media_type="image/png")
|
|
|
|
@router.post("/sessions/{session_id}/ground-truth/{page_number}")
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
    """Save ground truth labels for a page.

    Expects body with 'entries' list - each entry has english, german, example,
    status ('confirmed' | 'edited' | 'skipped'), and bbox fields.

    The entries are stored in the session under ``ground_truth`` (keyed by the
    page number as a string) and written to
    ``<GROUND_TRUTH_DIR>/<session_id>_page<page_number>.json``.

    Returns a summary with the number of saved entries, the count per status
    and the path of the written file.

    Raises:
        HTTPException: 404 for an unknown session, 400 when 'entries' is
            missing, empty, or not a list of JSON objects.
    """
    logger.info(f"Save ground truth for session {session_id}, page {page_number}")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    entries = data.get("entries", [])
    if not entries:
        raise HTTPException(status_code=400, detail="No entries provided")

    # Reject malformed payloads up front; otherwise they would be cached and
    # written to disk before failing in the status summary below.
    if not isinstance(entries, list) or not all(isinstance(e, dict) for e in entries):
        raise HTTPException(status_code=400, detail="'entries' must be a list of objects")

    # Save in session
    session = _sessions[session_id]
    session.setdefault("ground_truth", {})[str(page_number)] = entries

    # Also save to disk
    os.makedirs(GROUND_TRUTH_DIR, exist_ok=True)
    gt_path = os.path.join(GROUND_TRUTH_DIR, f"{session_id}_page{page_number}.json")
    gt_data = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.now().isoformat(),
        "entry_count": len(entries),
        "entries": entries,
    }
    with open(gt_path, 'w', encoding='utf-8') as f:
        json.dump(gt_data, f, ensure_ascii=False, indent=2)

    logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}")

    # Count the three known statuses in one pass; anything else is ignored.
    status_counts = {"confirmed": 0, "edited": 0, "skipped": 0}
    for entry in entries:
        status = entry.get("status")
        if isinstance(status, str) and status in status_counts:
            status_counts[status] += 1

    return {
        "success": True,
        "saved_count": len(entries),
        "confirmed": status_counts["confirmed"],
        "edited": status_counts["edited"],
        "skipped": status_counts["skipped"],
        "file_path": gt_path,
    }
|
|
|
|
@router.get("/sessions/{session_id}/ground-truth/{page_number}")
async def load_ground_truth(session_id: str, page_number: int):
    """Return previously saved ground truth entries for a page.

    The session's in-memory ``ground_truth`` store is consulted first
    (``source: "cache"``); if it holds nothing for the page, the JSON file
    ``<session_id>_page<page_number>.json`` in GROUND_TRUTH_DIR is read
    instead (``source: "disk"``).

    Raises:
        HTTPException: 404 for an unknown session or when no ground truth
            exists for the page.
    """
    logger.info(f"Load ground truth for session {session_id}, page {page_number}")

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # In-memory copy wins when present
    in_memory = _sessions[session_id].get("ground_truth", {}).get(str(page_number))
    if in_memory:
        return {"success": True, "entries": in_memory, "source": "cache"}

    # Otherwise look for the file written at save time
    gt_path = os.path.join(GROUND_TRUTH_DIR, f"{session_id}_page{page_number}.json")
    if os.path.exists(gt_path):
        with open(gt_path, 'r', encoding='utf-8') as handle:
            stored = json.load(handle)
        return {"success": True, "entries": stored.get("entries", []), "source": "disk"}

    raise HTTPException(status_code=404, detail="No ground truth found for this page")