A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2066 lines
74 KiB
Python
"""
|
|
Vocabulary Worksheet API - Extract vocabulary from textbook pages and generate worksheets.
|
|
|
|
DATENSCHUTZ/PRIVACY:
|
|
- Alle Verarbeitung erfolgt lokal (Mac Mini mit Ollama)
|
|
- Keine Daten werden an externe Server gesendet
|
|
- DSGVO-konform fuer Schulumgebungen
|
|
|
|
Workflow:
|
|
1. POST /sessions - Create a vocabulary extraction session
|
|
2. POST /sessions/{id}/upload - Upload textbook page image
|
|
3. GET /sessions/{id}/vocabulary - Get extracted vocabulary
|
|
4. PUT /sessions/{id}/vocabulary - Edit vocabulary (corrections)
|
|
5. POST /sessions/{id}/generate - Generate worksheet PDF
|
|
6. GET /worksheets/{id}/pdf - Download generated PDF
|
|
"""
|
|
|
|
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Query
|
|
from fastapi.responses import StreamingResponse
|
|
from pydantic import BaseModel
|
|
from typing import Optional, List, Dict, Any
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
import uuid
|
|
import os
|
|
import io
|
|
import json
|
|
import base64
|
|
import logging
|
|
|
|
# PostgreSQL persistence (replaces in-memory storage)
|
|
from vocab_session_store import (
|
|
init_vocab_tables,
|
|
create_session_db,
|
|
get_session_db,
|
|
list_sessions_db,
|
|
update_session_db,
|
|
delete_session_db,
|
|
add_vocabulary_db,
|
|
get_vocabulary_db,
|
|
update_vocabulary_db,
|
|
clear_page_vocabulary_db,
|
|
create_worksheet_db,
|
|
get_worksheet_db,
|
|
delete_worksheets_for_session_db,
|
|
cache_pdf_data,
|
|
get_cached_pdf_data,
|
|
clear_cached_pdf_data,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)

# Ollama Configuration - Direct call without external modules.
# host.docker.internal reaches the host's Ollama daemon from inside Docker.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")

# Try to import MinIO storage; fall back to the local filesystem when the
# optional dependency is missing (MINIO_AVAILABLE gates usage elsewhere).
try:
    from minio_storage import upload_to_minio, get_from_minio
    MINIO_AVAILABLE = True
except ImportError:
    MINIO_AVAILABLE = False
    logger.warning("MinIO storage not available, using local storage")

router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])

# Local storage path for uploaded page images and generated worksheet PDFs.
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
|
|
|
|
|
|
# =============================================================================
|
|
# Enums and Pydantic Models
|
|
# =============================================================================
|
|
|
|
class WorksheetType(str, Enum):
    """Exercise layouts a worksheet can be rendered in.

    Inherits from ``str`` so members serialize directly in JSON bodies.
    """

    EN_TO_DE = "en_to_de"        # translate English -> German
    DE_TO_EN = "de_to_en"        # translate German -> English
    COPY_PRACTICE = "copy"       # write each word several times
    GAP_FILL = "gap_fill"        # fill-in-the-blank sentences
    COMBINED = "combined"        # all exercise types combined
|
|
|
|
|
|
class SessionStatus(str, Enum):
    """Lifecycle states of a vocabulary extraction session."""

    PENDING = "pending"        # session created, no upload yet
    PROCESSING = "processing"  # OCR in progress
    EXTRACTED = "extracted"    # vocabulary extracted, ready to edit
    COMPLETED = "completed"    # worksheet generated
|
|
|
|
|
|
class VocabularyEntry(BaseModel):
    """One vocabulary pair extracted from a textbook page.

    Core fields (english/german) are required; everything else is optional
    metadata produced by the OCR pipeline.
    """

    id: str
    english: str
    german: str
    example_sentence: Optional[str] = None
    example_sentence_gap: Optional[str] = None  # With ___ for gap-fill
    word_type: Optional[str] = None  # noun, verb, adjective, etc.
    source_page: Optional[int] = None  # Page number where entry was found (1-indexed)
    # Grid position fields for layout-preserving OCR
    source_x: Optional[float] = None  # X position as percentage (0-100)
    source_y: Optional[float] = None  # Y position as percentage (0-100)
    source_width: Optional[float] = None  # Width as percentage (0-100)
    source_height: Optional[float] = None  # Height as percentage (0-100)
    source_column: Optional[int] = None  # 0-indexed column in detected grid
    source_row: Optional[int] = None  # 0-indexed row in detected grid
    confidence: Optional[float] = None  # OCR confidence score (0-1)
    recognition_status: Optional[str] = None  # recognized | manual | unrecognized
|
|
|
|
|
|
class OcrPrompts(BaseModel):
    """OCR filtering settings sent by the frontend.

    Field names are camelCase to match the frontend JSON payload.
    Note: pydantic deep-copies mutable defaults per instance, so the
    list defaults here are safe (unlike plain Python defaults).
    """

    filterHeaders: bool = True       # drop detected page headers
    filterFooters: bool = True       # drop detected page footers
    filterPageNumbers: bool = True   # drop bare page numbers
    customFilter: str = ""           # free-form additional filter text
    headerPatterns: List[str] = []   # explicit header patterns to remove
    footerPatterns: List[str] = []   # explicit footer patterns to remove
|
|
|
|
|
|
class SessionCreate(BaseModel):
    """Request body for creating a vocabulary extraction session."""

    name: str
    description: Optional[str] = None
    source_language: str = "en"  # Source language (default English)
    target_language: str = "de"  # Target language (default German)
    ocr_prompts: Optional[OcrPrompts] = None  # OCR filtering settings from frontend
|
|
|
|
|
|
class SessionResponse(BaseModel):
    """Public representation of a vocabulary session returned by the API."""

    id: str
    name: str
    description: Optional[str]
    source_language: str
    target_language: str
    status: str                  # one of the SessionStatus values
    vocabulary_count: int        # number of extracted entries so far
    image_path: Optional[str]    # storage path of the uploaded page, if any
    created_at: datetime
|
|
|
|
|
|
class VocabularyResponse(BaseModel):
    """Extracted vocabulary for a single session."""

    session_id: str
    vocabulary: List[VocabularyEntry]
    extraction_confidence: Optional[float]  # heuristic OCR confidence (0-1)
|
|
|
|
|
|
class VocabularyUpdate(BaseModel):
    """Full replacement list of vocabulary entries (manual corrections)."""

    vocabulary: List[VocabularyEntry]
|
|
|
|
|
|
class WorksheetGenerateRequest(BaseModel):
    """Options controlling worksheet PDF generation."""

    worksheet_types: List[WorksheetType]
    title: Optional[str] = None          # defaults to a generated title when omitted
    include_solutions: bool = True       # also generate a solutions sheet
    repetitions: int = 3                 # For copy practice
    line_height: str = "normal"          # normal, large, extra-large
|
|
|
|
|
|
class WorksheetResponse(BaseModel):
    """Metadata describing a generated worksheet."""

    id: str
    session_id: str
    worksheet_types: List[str]
    pdf_path: str
    solution_path: Optional[str]  # set when a solutions sheet was generated
    generated_at: datetime
|
|
|
|
|
|
# =============================================================================
|
|
# PostgreSQL Storage (persistent across container restarts)
|
|
# =============================================================================
|
|
|
|
# Note: In-memory storage removed. All data now persisted in PostgreSQL.
|
|
# See vocab_session_store.py for implementation.
|
|
|
|
# Startup event to initialize tables.
# NOTE(review): APIRouter.on_event is deprecated in newer FastAPI versions in
# favour of lifespan handlers -- confirm the pinned FastAPI version supports it.
@router.on_event("startup")
async def startup():
    """Initialize vocab session PostgreSQL tables on startup."""
    logger.info("Initializing vocab session PostgreSQL tables...")
    success = await init_vocab_tables()
    if success:
        logger.info("Vocab session tables ready")
    else:
        # Non-fatal: the API still starts, but persistence calls may fail later.
        logger.warning("Failed to initialize vocab tables - storage may not work")
|
|
|
|
|
|
# =============================================================================
|
|
# Vision LLM Vocabulary Extraction
|
|
# =============================================================================
|
|
|
|
VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.
|
|
|
|
AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:
|
|
|
|
{
|
|
"vocabulary": [
|
|
{
|
|
"english": "to improve",
|
|
"german": "verbessern",
|
|
"example": "I want to improve my English."
|
|
}
|
|
]
|
|
}
|
|
|
|
REGELN:
|
|
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
|
|
2. Behalte die exakte Schreibweise bei
|
|
3. Bei fehlenden Beispielsaetzen: "example": null
|
|
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
|
|
5. Gib NUR valides JSON zurueck, keine Erklaerungen
|
|
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"
|
|
|
|
Beispiel-Output:
|
|
{
|
|
"vocabulary": [
|
|
{"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
|
|
{"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
|
|
]
|
|
}"""
|
|
|
|
|
|
async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    ocr_method: str = "tesseract"  # Options: "tesseract" (D), "vision_llm" (B), "paddleocr" (C)
) -> tuple[List[VocabularyEntry], float, str]:
    """
    Extract vocabulary from an image using different OCR methods.

    OCR Methods (documented in SBOM):
    - Loesung A: User's 32B LLM (external)
    - Loesung B: Vision LLM (Ollama, model from OLLAMA_VISION_MODEL)
    - Loesung C: PaddleOCR + LLM (DISABLED - does not work under Rosetta 2)
    - Loesung D: Tesseract OCR + LLM (ARM64-native, Apache 2.0) <- DEFAULT

    Methods C and D fall through to the Vision LLM (B) on any failure.

    Args:
        image_data: Image bytes
        filename: Original filename for logging
        page_number: 0-indexed page number for error messages
        ocr_method: OCR method to use ("tesseract", "vision_llm", "paddleocr")

    Returns:
        Tuple of (vocabulary_entries, confidence, error_message);
        error_message is an empty string on success.
    """
    import httpx

    # =========================================================================
    # LOESUNG D: Tesseract OCR + LLM Gateway (DEFAULT - ARM64-native)
    # =========================================================================
    if ocr_method == "tesseract":
        try:
            from tesseract_vocab_extractor import extract_vocabulary_tesseract, is_tesseract_available

            if not is_tesseract_available():
                logger.warning("Tesseract not available, falling back to Vision LLM")
                ocr_method = "vision_llm"
            else:
                # FIX: log the actual filename (the placeholder had been lost).
                logger.info(f"Using TESSERACT OCR for {filename} (Loesung D)")

                vocab_dicts, confidence, error = await extract_vocabulary_tesseract(image_data, filename)

                if error:
                    # Fall through to the Vision LLM fallback below.
                    logger.warning(f"Tesseract extraction had issues: {error}")
                elif vocab_dicts:
                    # Tesseract results use source/target wording; map them
                    # onto english/german based on the detected source_lang.
                    vocabulary = [
                        VocabularyEntry(
                            id=str(uuid.uuid4()),
                            english=v.get("source_word", "") if v.get("source_lang") == "en" else v.get("target_word", ""),
                            german=v.get("source_word", "") if v.get("source_lang") == "de" else v.get("target_word", ""),
                            example_sentence=v.get("context"),
                            source_page=page_number + 1
                        )
                        for v in vocab_dicts
                    ]
                    logger.info(f"Tesseract extraction: {len(vocabulary)} entries from {filename}")
                    return vocabulary, confidence, ""

        except ImportError as e:
            logger.warning(f"Tesseract extractor not available: {e}. Falling back to Vision LLM.")
            ocr_method = "vision_llm"
        except Exception as e:
            logger.warning(f"Tesseract extraction failed: {e}. Falling back to Vision LLM.")
            import traceback
            logger.debug(traceback.format_exc())
            ocr_method = "vision_llm"

    # =========================================================================
    # LOESUNG C: PaddleOCR + LLM Gateway (DISABLED - Rosetta 2 issues)
    # =========================================================================
    if ocr_method == "paddleocr":
        try:
            from hybrid_vocab_extractor import extract_vocabulary_hybrid
            logger.info(f"Using PADDLEOCR for {filename} (Loesung C - experimentell)")

            vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number)

            if error:
                logger.warning(f"PaddleOCR extraction had issues: {error}")
            elif vocab_dicts:
                vocabulary = [
                    VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=v.get("english", ""),
                        german=v.get("german", ""),
                        example_sentence=v.get("example"),
                        source_page=page_number + 1
                    )
                    for v in vocab_dicts
                    if v.get("english") and v.get("german")
                ]
                logger.info(f"PaddleOCR extraction: {len(vocabulary)} entries from {filename}")
                return vocabulary, confidence, ""

        except ImportError as e:
            logger.warning(f"PaddleOCR not available: {e}. Falling back to Vision LLM.")
        except Exception as e:
            logger.warning(f"PaddleOCR failed: {e}. Falling back to Vision LLM.")
            import traceback
            logger.debug(traceback.format_exc())

    # =========================================================================
    # FALLBACK: Vision LLM (Ollama)
    # =========================================================================
    logger.info(f"Using VISION LLM extraction for {filename}")

    try:
        # First check if Ollama is available at all (cheap, short timeout).
        async with httpx.AsyncClient(timeout=10.0) as check_client:
            try:
                health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
                if health_response.status_code != 200:
                    logger.error(f"Ollama not available at {OLLAMA_URL}")
                    return [], 0.0, f"Seite {page_number + 1}: Ollama nicht verfuegbar"
            except Exception as e:
                logger.error(f"Ollama health check failed: {e}")
                return [], 0.0, f"Seite {page_number + 1}: Verbindung zu Ollama fehlgeschlagen"

        image_base64 = base64.b64encode(image_data).decode("utf-8")

        payload = {
            "model": VISION_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": VOCAB_EXTRACTION_PROMPT,
                    "images": [image_base64]
                }
            ],
            "stream": False,
            "options": {
                "temperature": 0.1,    # near-deterministic output for JSON parsing
                "num_predict": 4096,
            }
        }

        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")

        # Increased timeout for Vision models (they can be slow)
        async with httpx.AsyncClient(timeout=600.0) as client:
            response = await client.post(
                f"{OLLAMA_URL}/api/chat",
                json=payload,
                timeout=300.0  # 5 minutes per page
            )
            response.raise_for_status()

            data = response.json()
            extracted_text = data.get("message", {}).get("content", "")

            logger.info(f"Ollama response received: {len(extracted_text)} chars")

            # Parse JSON from response
            vocabulary = parse_vocabulary_json(extracted_text)

            # Set source_page for each entry (1-indexed for display)
            for v in vocabulary:
                v.source_page = page_number + 1

            # Estimate confidence; Ollama exposes no per-token scores.
            confidence = 0.85 if len(vocabulary) > 0 else 0.1

            logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")

            return vocabulary, confidence, ""

    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"Seite {page_number + 1}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as e:
        logger.error(f"Vocabulary extraction failed for {filename}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"
|
|
|
|
|
|
def _get_demo_vocabulary() -> List[VocabularyEntry]:
    """Return a fixed demo vocabulary set for testing without a Vision LLM."""
    samples = [
        ("to achieve", "erreichen, erzielen", "She achieved her goals."),
        ("achievement", "Leistung, Errungenschaft", "That was a great achievement."),
        ("improve", "verbessern", "I want to improve my English."),
        ("improvement", "Verbesserung", "There has been a lot of improvement."),
        ("success", "Erfolg", "The project was a success."),
        ("successful", "erfolgreich", "She is a successful businesswoman."),
        ("fail", "scheitern, durchfallen", "Don't be afraid to fail."),
        ("failure", "Misserfolg, Versagen", "Failure is part of learning."),
    ]
    return [
        VocabularyEntry(
            id=str(uuid.uuid4()),
            english=english,
            german=german,
            example_sentence=example,
        )
        for english, german, example in samples
    ]
|
|
|
|
|
|
def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse vocabulary JSON from an LLM response with robust error handling.

    Parsing strategies, in order:
      1. json.loads on the first {...} span found in the text.
      2. Same after stripping control characters and escaping raw newlines.
      3. Same after fixing trailing commas and unquoted keys.
      4. Regex extraction of individual entries as a last resort.

    Returns:
        Validated VocabularyEntry list; empty list when nothing parses.
    """
    import re

    def clean_json_string(s: str) -> str:
        """Remove control characters and escape raw newlines/tabs."""
        # Remove control characters except newlines and tabs
        s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
        # Replace unescaped newlines within strings with escaped ones
        s = s.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
        return s

    def try_parse_json(json_str: str):
        """Try multiple strategies to parse JSON; return None when all fail."""
        # Strategy 1: Direct parse
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass

        # Strategy 2: Clean and parse
        try:
            return json.loads(clean_json_string(json_str))
        except json.JSONDecodeError:
            pass

        # Strategy 3: Fix common LLM output mistakes
        try:
            # Remove trailing commas before } or ]
            fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
            # Quote unquoted keys
            fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
            return json.loads(fixed)
        except json.JSONDecodeError:
            pass

        return None

    try:
        # Find JSON in response (the model may wrap it in extra text)
        start = text.find('{')
        end = text.rfind('}') + 1

        if start == -1 or end == 0:
            logger.warning("No JSON found in response")
            return []

        json_str = text[start:end]
        data = try_parse_json(json_str)

        if data is None:
            # Strategy 4: Extract vocabulary entries using regex as fallback
            logger.warning("JSON parsing failed, trying regex extraction")
            vocabulary = []
            # Match patterns like {"english": "...", "german": "...", ...}
            pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'
            matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)

            for match in matches:
                english = match[0].strip() if match[0] else ""
                german = match[1].strip() if match[1] else ""
                example = match[2].strip() if len(match) > 2 and match[2] else None

                if english and german:
                    vocabulary.append(VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=english,
                        german=german,
                        example_sentence=example,
                    ))

            if vocabulary:
                logger.info(f"Regex extraction found {len(vocabulary)} entries")
            return vocabulary

        # Normal JSON parsing succeeded
        vocabulary = []
        entries = data.get("vocabulary", []) if isinstance(data, dict) else []
        for entry in entries:
            # FIX: the prompt allows null values and the model may emit
            # non-object items; previously `None.strip()` raised and the
            # outer except discarded ALL entries.
            if not isinstance(entry, dict):
                continue
            english = (entry.get("english") or "").strip()
            german = (entry.get("german") or "").strip()

            # Skip entries that look like hallucinations (overly long values)
            if len(english) > 100 or len(german) > 200:
                logger.warning(f"Skipping suspicious entry: {english[:50]}...")
                continue

            if not english or not german:
                continue

            vocabulary.append(VocabularyEntry(
                id=str(uuid.uuid4()),
                english=english,
                german=german,
                example_sentence=entry.get("example"),
                word_type=entry.get("word_type"),
            ))

        return vocabulary

    except Exception as e:
        logger.error(f"Failed to parse vocabulary JSON: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []
|
|
|
|
|
|
# =============================================================================
|
|
# Worksheet PDF Generation
|
|
# =============================================================================
|
|
|
|
def generate_worksheet_html(
    vocabulary: List[VocabularyEntry],
    worksheet_type: WorksheetType,
    title: str,
    show_solutions: bool = False,
    repetitions: int = 3,
    line_height: str = "normal"
) -> str:
    """Generate HTML for a worksheet.

    Args:
        vocabulary: Entries to render (english/german/example_sentence used).
        worksheet_type: Which exercise layout to produce.
        title: Worksheet heading.
        show_solutions: Render answers instead of blank lines.
        repetitions: Copies per word for copy practice.
        line_height: "normal", "large" or "extra-large" writing-line spacing.

    Returns:
        A complete standalone HTML document as a string.
    """
    # FIX: OCR-derived text was interpolated into the markup unescaped, so a
    # stray '<' or '&' in a scanned word could break (or inject into) the page.
    from html import escape

    # Line height CSS
    line_heights = {
        "normal": "2.5em",
        "large": "3.5em",
        "extra-large": "4.5em"
    }
    lh = line_heights.get(line_height, "2.5em")

    html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
@page {{ size: A4; margin: 2cm; }}
body {{ font-family: 'Segoe UI', Arial, sans-serif; font-size: 14px; }}
h1 {{ font-size: 24px; margin-bottom: 10px; }}
.meta {{ color: #666; margin-bottom: 20px; }}
.name-line {{ margin-bottom: 30px; }}
.vocab-table {{ width: 100%; border-collapse: collapse; }}
.vocab-table td {{ padding: 8px; border-bottom: 1px solid #ddd; line-height: {lh}; }}
.vocab-word {{ width: 40%; font-weight: 500; }}
.vocab-blank {{ width: 60%; border-bottom: 2px dotted #999; }}
.vocab-answer {{ width: 60%; color: #2563eb; }}
.gap {{ border-bottom: 2px solid #333; min-width: 100px; display: inline-block; }}
.hint {{ color: #666; font-style: italic; font-size: 12px; }}
.section {{ margin-top: 30px; }}
.section-title {{ font-size: 16px; font-weight: 600; margin-bottom: 15px; color: #374151; }}
</style>
</head>
<body>
<h1>{escape(title)}</h1>
<div class="name-line">Name: _________________________ Datum: _____________</div>
"""

    if worksheet_type == WorksheetType.EN_TO_DE:
        html += '<div class="section"><div class="section-title">Uebersetze ins Deutsche:</div>'
        html += '<table class="vocab-table">'
        for entry in vocabulary:
            if show_solutions:
                html += f'<tr><td class="vocab-word">{escape(entry.english)}</td><td class="vocab-answer">{escape(entry.german)}</td></tr>'
            else:
                html += f'<tr><td class="vocab-word">{escape(entry.english)}</td><td class="vocab-blank"></td></tr>'
        html += '</table></div>'

    elif worksheet_type == WorksheetType.DE_TO_EN:
        html += '<div class="section"><div class="section-title">Uebersetze ins Englische:</div>'
        html += '<table class="vocab-table">'
        for entry in vocabulary:
            if show_solutions:
                html += f'<tr><td class="vocab-word">{escape(entry.german)}</td><td class="vocab-answer">{escape(entry.english)}</td></tr>'
            else:
                html += f'<tr><td class="vocab-word">{escape(entry.german)}</td><td class="vocab-blank"></td></tr>'
        html += '</table></div>'

    elif worksheet_type == WorksheetType.COPY_PRACTICE:
        html += '<div class="section"><div class="section-title">Schreibe jedes Wort mehrmals:</div>'
        html += '<table class="vocab-table">'
        for entry in vocabulary:
            html += f'<tr><td class="vocab-word">{escape(entry.english)}</td>'
            html += '<td class="vocab-blank">'
            if show_solutions:
                html += f' {escape(entry.english)} ' * repetitions
            html += '</td></tr>'
        html += '</table></div>'

    elif worksheet_type == WorksheetType.GAP_FILL:
        entries_with_examples = [e for e in vocabulary if e.example_sentence]
        if entries_with_examples:
            html += '<div class="section"><div class="section-title">Fuege das passende Wort ein:</div>'
            gap_span = '<span class="gap"></span>'
            for i, entry in enumerate(entries_with_examples, 1):
                # Escape first, then blank out the first vocabulary word found
                # in the sentence (as-is, Capitalized, and lowercase variants).
                gap_sentence = escape(entry.example_sentence)
                for word in entry.english.split():
                    esc_word = escape(word)
                    if esc_word.lower() in gap_sentence.lower():
                        gap_sentence = gap_sentence.replace(esc_word, gap_span)
                        gap_sentence = gap_sentence.replace(escape(word.capitalize()), gap_span)
                        gap_sentence = gap_sentence.replace(escape(word.lower()), gap_span)
                        break

                html += f'<p>{i}. {gap_sentence}</p>'
                if show_solutions:
                    html += f'<p class="hint">Loesung: {escape(entry.english)}</p>'
                else:
                    html += f'<p class="hint">({escape(entry.german)})</p>'
            html += '</div>'

    html += '</body></html>'
    return html
|
|
|
|
|
|
async def generate_worksheet_pdf(html: str) -> bytes:
    """Render an HTML document to PDF bytes with WeasyPrint.

    Falls back to returning the UTF-8 encoded HTML itself when WeasyPrint
    is not installed; any other rendering error is logged and re-raised.
    """
    try:
        from weasyprint import HTML
        return HTML(string=html).write_pdf()
    except ImportError:
        logger.warning("WeasyPrint not available, returning HTML")
        return html.encode('utf-8')
    except Exception as exc:
        logger.error(f"PDF generation failed: {exc}")
        raise
|
|
|
|
|
|
# =============================================================================
|
|
# API Endpoints
|
|
# =============================================================================
|
|
|
|
@router.post("/sessions", response_model=SessionResponse)
|
|
async def create_session(session: SessionCreate):
|
|
"""Create a new vocabulary extraction session."""
|
|
session_id = str(uuid.uuid4())
|
|
|
|
# Store in PostgreSQL
|
|
db_session = await create_session_db(
|
|
session_id=session_id,
|
|
name=session.name,
|
|
description=session.description,
|
|
source_language=session.source_language,
|
|
target_language=session.target_language,
|
|
ocr_prompts=session.ocr_prompts.model_dump() if session.ocr_prompts else None,
|
|
)
|
|
|
|
if db_session is None:
|
|
raise HTTPException(status_code=500, detail="Failed to create session in database")
|
|
|
|
# Create storage directory for files
|
|
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
|
|
os.makedirs(session_dir, exist_ok=True)
|
|
|
|
return SessionResponse(
|
|
id=session_id,
|
|
name=session.name,
|
|
description=session.description,
|
|
source_language=session.source_language,
|
|
target_language=session.target_language,
|
|
status=SessionStatus.PENDING.value,
|
|
vocabulary_count=0,
|
|
image_path=None,
|
|
created_at=db_session.created_at or datetime.utcnow(),
|
|
)
|
|
|
|
|
|
@router.get("/sessions", response_model=List[SessionResponse])
|
|
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
|
|
"""List all vocabulary sessions."""
|
|
sessions = await list_sessions_db(limit=limit)
|
|
|
|
return [
|
|
SessionResponse(
|
|
id=s.id,
|
|
name=s.name,
|
|
description=s.description,
|
|
source_language=s.source_language,
|
|
target_language=s.target_language,
|
|
status=s.status,
|
|
vocabulary_count=s.vocabulary_count,
|
|
image_path=s.image_path,
|
|
created_at=s.created_at or datetime.utcnow(),
|
|
)
|
|
for s in sessions
|
|
]
|
|
|
|
|
|
@router.get("/sessions/{session_id}", response_model=SessionResponse)
|
|
async def get_session(session_id: str):
|
|
"""Get a specific session."""
|
|
s = await get_session_db(session_id)
|
|
if s is None:
|
|
raise HTTPException(status_code=404, detail="Session not found")
|
|
|
|
return SessionResponse(
|
|
id=s.id,
|
|
name=s.name,
|
|
description=s.description,
|
|
source_language=s.source_language,
|
|
target_language=s.target_language,
|
|
status=s.status,
|
|
vocabulary_count=s.vocabulary_count,
|
|
image_path=s.image_path,
|
|
created_at=s.created_at or datetime.utcnow(),
|
|
)
|
|
|
|
|
|
def get_pdf_page_count(pdf_data: bytes) -> int:
    """Return the number of pages in a PDF, or 0 when it cannot be read."""
    try:
        import fitz  # PyMuPDF

        doc = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            return doc.page_count
        finally:
            doc.close()
    except Exception as exc:
        # Covers both a missing PyMuPDF install and unreadable PDF data.
        logger.error(f"Failed to get PDF page count: {exc}")
        return 0
|
|
|
|
|
|
async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
    """Render one PDF page to PNG bytes using PyMuPDF.

    Args:
        pdf_data: PDF file as bytes
        page_number: 0-indexed page number
        thumbnail: If True, render at half scale instead of double scale

    Raises:
        HTTPException: 500 when PyMuPDF is missing, 400 on any conversion error.
    """
    import gc

    pixmap = None
    doc = None

    try:
        import fitz  # PyMuPDF

        doc = fitz.open(stream=pdf_data, filetype="pdf")

        if doc.page_count == 0:
            raise ValueError("PDF has no pages")
        if page_number >= doc.page_count:
            raise ValueError(f"Page {page_number} does not exist (PDF has {doc.page_count} pages)")

        # Thumbnails render at 0.5x; OCR input needs the sharper 2x scale.
        scale = 0.5 if thumbnail else 2.0
        pixmap = doc[page_number].get_pixmap(matrix=fitz.Matrix(scale, scale))
        png_data = pixmap.tobytes("png")

        logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
        return png_data

    except ImportError:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed")
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
    finally:
        # Explicit cleanup to prevent OOM on large PDFs.
        if pixmap is not None:
            del pixmap
        if doc is not None:
            doc.close()
            del doc
        gc.collect()
|
|
|
|
|
|
async def convert_pdf_to_images(pdf_data: bytes, pages: Optional[List[int]] = None) -> List[bytes]:
    """Convert multiple pages of a PDF to PNG images.

    Args:
        pdf_data: PDF file as bytes
        pages: List of 0-indexed page numbers to convert. If None, convert
            all pages. Out-of-range page numbers are silently skipped.

    Returns:
        PNG bytes for each converted page, in the requested order.

    Raises:
        HTTPException: 500 when PyMuPDF is missing, 400 on conversion errors.
    """
    import gc
    pdf_document = None

    try:
        import fitz  # PyMuPDF

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")

        if pdf_document.page_count == 0:
            raise ValueError("PDF has no pages")

        # If no pages specified, convert all
        if pages is None:
            pages = list(range(pdf_document.page_count))

        images = []
        zoom = 2.0  # 2x scale: resolution high enough for OCR
        mat = fitz.Matrix(zoom, zoom)

        for page_num in pages:
            # FIX: also reject negative indices, which PyMuPDF would
            # otherwise interpret as counting from the end of the document.
            if 0 <= page_num < pdf_document.page_count:
                page = pdf_document[page_num]
                pix = page.get_pixmap(matrix=mat)
                images.append(pix.tobytes("png"))
                # Cleanup pixmap immediately to prevent memory buildup
                del pix
                gc.collect()

        logger.info(f"Converted {len(images)} PDF pages to images")
        return images

    except ImportError:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available")
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
    finally:
        if pdf_document is not None:
            pdf_document.close()
            del pdf_document
        gc.collect()
|
|
|
|
|
|
@router.post("/sessions/{session_id}/upload")
async def upload_image(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a textbook page image or PDF and extract vocabulary.

    Supported formats: PNG, JPG, JPEG, PDF

    Flow: validate the file type -> (for PDFs: convert page 0 to PNG) ->
    save the image under the session directory -> run extraction via
    ``extract_vocabulary_from_image`` -> persist vocabulary and stats in
    the database and mark the session EXTRACTED.

    Raises:
        HTTPException 404: session does not exist.
        HTTPException 400: unsupported file type.
    """
    logger.info(f"Upload request for session {session_id}")
    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")

    session = await get_session_db(session_id)
    if session is None:
        logger.error(f"Session {session_id} not found")
        raise HTTPException(status_code=404, detail="Session not found")

    # Validate file type - check both extension and content type, since
    # browsers are inconsistent about which one they populate.
    extension = file.filename.split('.')[-1].lower() if file.filename else ''
    content_type = file.content_type or ''

    # Accept images and PDFs
    valid_image_extensions = ['png', 'jpg', 'jpeg']
    valid_image_content_types = ['image/png', 'image/jpeg', 'image/jpg']
    is_pdf = extension == 'pdf' or content_type == 'application/pdf'
    is_image = extension in valid_image_extensions or content_type in valid_image_content_types

    if not is_pdf and not is_image:
        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
        raise HTTPException(
            status_code=400,
            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
        )

    # Determine final extension for saving
    if is_pdf:
        save_extension = 'png'  # PDFs will be converted to PNG
    elif extension in valid_image_extensions:
        save_extension = extension
    elif content_type == 'image/png':
        save_extension = 'png'
    else:
        # Only the content type identified this as an image (jpeg variants).
        save_extension = 'jpg'

    # Read file content
    content = await file.read()
    logger.info(f"Read {len(content)} bytes from uploaded file")

    # Convert PDF to image if needed (first page only for single upload)
    if is_pdf:
        logger.info("Converting PDF to image...")
        content = await convert_pdf_page_to_image(content, page_number=0, thumbnail=False)
        logger.info(f"PDF converted, image size: {len(content)} bytes")

    # Save image under the per-session storage directory
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    image_path = os.path.join(session_dir, f"source.{save_extension}")

    with open(image_path, 'wb') as f:
        f.write(content)

    # Update session status in DB before the (slow) extraction step
    await update_session_db(session_id, status=SessionStatus.PROCESSING.value, image_path=image_path)

    # Extract vocabulary using Vision LLM
    vocabulary, confidence, error = await extract_vocabulary_from_image(content, file.filename or "image.png", page_number=0)

    # Store vocabulary in DB
    vocab_dicts = [v.dict() for v in vocabulary]
    await add_vocabulary_db(session_id, vocab_dicts)

    # Update session with extraction results
    await update_session_db(
        session_id,
        status=SessionStatus.EXTRACTED.value,
        extraction_confidence=confidence,
        vocabulary_count=len(vocabulary),
    )

    result = {
        "session_id": session_id,
        "filename": file.filename,
        "image_path": image_path,
        "vocabulary_count": len(vocabulary),
        "extraction_confidence": confidence,
        "status": SessionStatus.EXTRACTED.value,
    }

    # Surface extraction problems without failing the request: a partial
    # result may still be useful to the client.
    if error:
        result["error"] = error

    return result
|
|
|
|
|
|
@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
async def get_vocabulary(session_id: str):
    """Return all vocabulary entries extracted for the given session."""
    # Guard clause: the session must exist before we read its vocabulary.
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Rehydrate Pydantic models from the stored dict rows.
    rows = await get_vocabulary_db(session_id)
    entries = [VocabularyEntry(**row) for row in rows]

    return VocabularyResponse(
        session_id=session_id,
        vocabulary=entries,
        extraction_confidence=session.extraction_confidence,
    )
|
|
|
|
|
|
@router.put("/sessions/{session_id}/vocabulary")
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
    """Replace the session's vocabulary with manually corrected entries."""
    # Verify the session exists; only the existence check is needed here.
    if await get_session_db(session_id) is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Full replacement: the payload becomes the new authoritative list.
    payload = [entry.dict() for entry in update.vocabulary]
    if not await update_vocabulary_db(session_id, payload):
        raise HTTPException(status_code=500, detail="Failed to update vocabulary")

    return {
        "session_id": session_id,
        "vocabulary_count": len(update.vocabulary),
        "message": "Vocabulary updated successfully",
    }
|
|
|
|
|
|
@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
    """Generate worksheet PDF(s) from extracted vocabulary.

    Renders one HTML section per requested worksheet type, joins them with
    forced page breaks, converts the result to a single PDF, and optionally
    produces a second PDF with solutions. Both files are written to the
    session directory and registered in the database.

    Raises:
        HTTPException 404: session does not exist.
        HTTPException 400: session has no vocabulary.
        HTTPException 500: PDF generation or DB persistence failed.
    """
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    vocab_dicts = await get_vocabulary_db(session_id)
    vocabulary = [VocabularyEntry(**v) for v in vocab_dicts]

    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")

    worksheet_id = str(uuid.uuid4())
    title = request.title or session.name

    # Generate HTML for each worksheet type; a CSS page break separates the
    # sections in the combined document.
    combined_html = ""
    for wtype in request.worksheet_types:
        html = generate_worksheet_html(
            vocabulary=vocabulary,
            worksheet_type=wtype,
            title=f"{title} - {wtype.value}",
            show_solutions=False,
            repetitions=request.repetitions,
            line_height=request.line_height,
        )
        combined_html += html + '<div style="page-break-after: always;"></div>'

    # Generate PDF
    try:
        pdf_bytes = await generate_worksheet_pdf(combined_html)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}")

    # Save PDF to the per-session storage directory
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)

    # Generate solution PDF if requested - same sections, rendered with
    # show_solutions=True.
    solution_path = None
    if request.include_solutions:
        solution_html = ""
        for wtype in request.worksheet_types:
            html = generate_worksheet_html(
                vocabulary=vocabulary,
                worksheet_type=wtype,
                title=f"{title} - {wtype.value} (Loesung)",
                show_solutions=True,
                repetitions=request.repetitions,
                line_height=request.line_height,
            )
            solution_html += html + '<div style="page-break-after: always;"></div>'

        solution_bytes = await generate_worksheet_pdf(solution_html)
        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
        with open(solution_path, 'wb') as f:
            f.write(solution_bytes)

    # Store worksheet info in DB
    worksheet = await create_worksheet_db(
        worksheet_id=worksheet_id,
        session_id=session_id,
        worksheet_types=[wt.value for wt in request.worksheet_types],
        pdf_path=pdf_path,
        solution_path=solution_path,
    )

    if worksheet is None:
        raise HTTPException(status_code=500, detail="Failed to save worksheet to database")

    # Update session status
    await update_session_db(session_id, status=SessionStatus.COMPLETED.value)

    return WorksheetResponse(
        id=worksheet_id,
        session_id=session_id,
        worksheet_types=worksheet.worksheet_types,
        pdf_path=pdf_path,
        solution_path=solution_path,
        # NOTE(review): datetime.utcnow() returns a naive timestamp and is
        # deprecated since 3.12 - consider datetime.now(timezone.utc) if
        # callers expect timezone-aware values.
        generated_at=worksheet.generated_at or datetime.utcnow(),
    )
|
|
|
|
|
|
@router.get("/worksheets/{worksheet_id}/pdf")
async def download_worksheet_pdf(worksheet_id: str):
    """Stream the generated worksheet PDF as a file download."""
    worksheet = await get_worksheet_db(worksheet_id)
    if worksheet is None:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    # The DB record may reference a file that was never written or was removed.
    path = worksheet.pdf_path
    if not (path and os.path.exists(path)):
        raise HTTPException(status_code=404, detail="PDF file not found")

    with open(path, 'rb') as fh:
        payload = fh.read()

    return StreamingResponse(
        io.BytesIO(payload),
        media_type="application/pdf",
        headers={"Content-Disposition": f"attachment; filename=worksheet_{worksheet_id}.pdf"},
    )
|
|
|
|
|
|
@router.get("/worksheets/{worksheet_id}/solution")
async def download_solution_pdf(worksheet_id: str):
    """Stream the solution PDF as a file download."""
    worksheet = await get_worksheet_db(worksheet_id)
    if worksheet is None:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    # Solutions are optional - the path may be unset or stale.
    path = worksheet.solution_path
    if not (path and os.path.exists(path)):
        raise HTTPException(status_code=404, detail="Solution PDF not found")

    with open(path, 'rb') as fh:
        payload = fh.read()

    return StreamingResponse(
        io.BytesIO(payload),
        media_type="application/pdf",
        headers={"Content-Disposition": f"attachment; filename=solution_{worksheet_id}.pdf"},
    )
|
|
|
|
|
|
@router.get("/sessions/{session_id}/image")
async def get_session_image(session_id: str):
    """Stream the originally uploaded source image for a session."""
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    path = session.image_path
    if not (path and os.path.exists(path)):
        raise HTTPException(status_code=404, detail="Image not found")

    # Map the file extension to a MIME type; unknown suffixes are served
    # as raw bytes.
    suffix = path.split('.')[-1].lower()
    mime_by_suffix = {
        'png': 'image/png',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
    }
    media_type = mime_by_suffix.get(suffix, 'application/octet-stream')

    with open(path, 'rb') as fh:
        payload = fh.read()

    return StreamingResponse(
        io.BytesIO(payload),
        media_type=media_type,
    )
|
|
|
|
|
|
@router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a PDF and get page count and thumbnails for preview.
    Use this before processing to let user select pages.
    """
    logger.info(f"PDF info request for session {session_id}")

    if await get_session_db(session_id) is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # This endpoint accepts PDFs only; images go through the regular upload.
    suffix = file.filename.split('.')[-1].lower() if file.filename else ''
    mime = file.content_type or ''
    if suffix != 'pdf' and mime != 'application/pdf':
        raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")

    raw = await file.read()

    # Persist the PDF so later page-level requests can reload it from disk.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, "source.pdf")
    with open(pdf_path, 'wb') as fh:
        fh.write(raw)

    total_pages = get_pdf_page_count(raw)

    # Keep the raw bytes in the in-memory cache for the multi-page workflow.
    cache_pdf_data(session_id, raw)

    # Record the upload on the session so the UI can drive page selection.
    await update_session_db(
        session_id,
        pdf_path=pdf_path,
        pdf_page_count=total_pages,
        status="pdf_uploaded",
    )

    return {
        "session_id": session_id,
        "page_count": total_pages,
        "filename": file.filename,
    }
|
|
|
|
|
|
@router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = False):
    """Render one PDF page of a session as a PNG preview.

    Args:
        session_id: Session ID
        page_number: 0-indexed page number
        hires: If True, return high-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5)
    """
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Prefer the in-memory cache; fall back to the file on disk and re-cache.
    pdf_bytes = get_cached_pdf_data(session_id)
    if not pdf_bytes and session.pdf_path and os.path.exists(session.pdf_path):
        with open(session.pdf_path, 'rb') as fh:
            pdf_bytes = fh.read()
        cache_pdf_data(session_id, pdf_bytes)

    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    # thumbnail=True renders the low-res preview; hires requests full zoom.
    png = await convert_pdf_page_to_image(pdf_bytes, page_number, thumbnail=not hires)

    return StreamingResponse(
        io.BytesIO(png),
        media_type="image/png",
    )
|
|
|
|
|
|
@router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
    session_id: str,
    page_number: int,
):
    """
    Process a SINGLE page of an uploaded PDF - completely isolated.

    This endpoint processes one page at a time to avoid LLM context issues.
    The frontend should call this sequentially for each page.

    Returns the vocabulary for just this one page. On extraction failure the
    response carries success=False and the error text instead of raising, so
    the frontend can continue with the remaining pages.

    Raises:
        HTTPException 404: session does not exist.
        HTTPException 400: no PDF uploaded or page number out of range.
    """
    logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")

    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Try cached PDF data first
    pdf_data = get_cached_pdf_data(session_id)

    # If not cached, try to load from file (and repopulate the cache)
    if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path):
        with open(session.pdf_path, 'rb') as f:
            pdf_data = f.read()
        cache_pdf_data(session_id, pdf_data)

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.pdf_page_count or 1

    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert just this ONE page to image (full resolution for extraction)
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Extract vocabulary from this single page
    vocabulary, confidence, error = await extract_vocabulary_from_image(
        image_data,
        f"page_{page_number + 1}.png",
        page_number=page_number
    )

    # On failure, report the page as unsuccessful instead of aborting the
    # whole multi-page run.
    if error:
        logger.warning(f"Page {page_number + 1} failed: {error}")
        return {
            "session_id": session_id,
            "page_number": page_number + 1,
            "success": False,
            "error": error,
            "vocabulary": [],
            "vocabulary_count": 0,
        }

    # Convert vocabulary entries to dicts with page info. The fallback chain
    # tolerates entries that are Pydantic models, plain objects, or mappings.
    page_vocabulary = []
    for entry in vocabulary:
        entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
        entry_dict['source_page'] = page_number + 1  # 1-based for display
        page_vocabulary.append(entry_dict)

    logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")

    # Clear existing entries for this page (in case of re-processing)
    await clear_page_vocabulary_db(session_id, page_number + 1)

    # Add new vocabulary entries to DB
    await add_vocabulary_db(session_id, page_vocabulary)

    # Update session status
    await update_session_db(session_id, status=SessionStatus.EXTRACTED.value)

    # Get total count across all pages processed so far
    all_vocab = await get_vocabulary_db(session_id)

    return {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(all_vocab),
        "extraction_confidence": confidence,
    }
|
|
|
|
|
|
@router.post("/sessions/{session_id}/compare-ocr/{page_number}")
async def compare_ocr_methods(
    session_id: str,
    page_number: int,
):
    """
    Compare different OCR methods on a single page.

    Runs available OCR solutions and compares:
    - Extraction time
    - Vocabulary found
    - Confidence scores

    Solutions tested:
    - Solution B: Vision LLM (qwen2.5vl:32b via Ollama)
    - Solution D: Tesseract OCR + LLM structuring
    - Solution E: Claude Vision API (Anthropic)

    Each method runs inside its own try/except so one failure never blocks
    the others; failed methods appear with success=False and an error text.

    Returns comparison data for frontend visualization, including which
    entries all methods agree on and a best-method recommendation.
    """
    import time
    import httpx

    logger.info(f"OCR Comparison for session {session_id}, page {page_number}")

    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Try cached PDF data first
    pdf_data = get_cached_pdf_data(session_id)

    # If not cached, try to load from file (and repopulate the cache)
    if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path):
        with open(session.pdf_path, 'rb') as f:
            pdf_data = f.read()
        cache_pdf_data(session_id, pdf_data)

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.pdf_page_count or 1

    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert page to image once (shared by all methods)
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    results = {
        "session_id": session_id,
        "page_number": page_number + 1,
        "methods": {}
    }

    # ==========================================================================
    # SOLUTION B: Vision LLM (qwen2.5vl:32b)
    # ==========================================================================
    try:
        start_time = time.time()
        vocab_b, confidence_b, error_b = await extract_vocabulary_from_image(
            image_data, f"page_{page_number + 1}.png", page_number, ocr_method="vision_llm"
        )
        duration_b = time.time() - start_time

        results["methods"]["vision_llm"] = {
            "name": "Loesung B: Vision LLM",
            "model": VISION_MODEL,
            "duration_seconds": round(duration_b, 2),
            "vocabulary_count": len(vocab_b),
            "vocabulary": [
                {"english": v.english, "german": v.german, "example": v.example_sentence}
                for v in vocab_b
            ],
            "confidence": confidence_b,
            "error": error_b if error_b else None,
            "success": len(vocab_b) > 0
        }
        logger.info(f"Vision LLM: {len(vocab_b)} entries in {duration_b:.2f}s")
    except Exception as e:
        # Record the failure and keep going with the remaining methods.
        results["methods"]["vision_llm"] = {
            "name": "Loesung B: Vision LLM",
            "error": str(e),
            "success": False
        }
        logger.error(f"Vision LLM comparison failed: {e}")

    # ==========================================================================
    # SOLUTION D: Tesseract OCR + LLM structuring
    # ==========================================================================
    try:
        start_time = time.time()
        vocab_d, confidence_d, error_d = await extract_vocabulary_from_image(
            image_data, f"page_{page_number + 1}.png", page_number, ocr_method="tesseract"
        )
        duration_d = time.time() - start_time

        results["methods"]["tesseract"] = {
            "name": "Loesung D: Tesseract OCR",
            "model": "tesseract + qwen2.5:14b",
            "duration_seconds": round(duration_d, 2),
            "vocabulary_count": len(vocab_d),
            "vocabulary": [
                {"english": v.english, "german": v.german, "example": v.example_sentence}
                for v in vocab_d
            ],
            "confidence": confidence_d,
            "error": error_d if error_d else None,
            "success": len(vocab_d) > 0
        }
        logger.info(f"Tesseract: {len(vocab_d)} entries in {duration_d:.2f}s")
    except Exception as e:
        results["methods"]["tesseract"] = {
            "name": "Loesung D: Tesseract OCR",
            "error": str(e),
            "success": False
        }
        logger.error(f"Tesseract comparison failed: {e}")

    # ==========================================================================
    # SOLUTION E: Claude Vision API (Anthropic)
    # ==========================================================================
    try:
        from claude_vocab_extractor import extract_vocabulary_claude, is_claude_available

        if is_claude_available():
            start_time = time.time()
            vocab_e_raw, confidence_e, error_e = await extract_vocabulary_claude(
                image_data, f"page_{page_number + 1}.png"
            )
            duration_e = time.time() - start_time

            # Convert to consistent format: the Claude extractor returns
            # source/target pairs, so map them onto english/german fields.
            vocab_e = []
            for v in vocab_e_raw:
                source_word = v.get("source_word", "")
                target_word = v.get("target_word", "")
                source_lang = v.get("source_lang", "en")
                # Determine which is English and which is German
                if source_lang == "en":
                    english = source_word
                    german = target_word
                else:
                    english = target_word
                    german = source_word

                vocab_e.append({
                    "english": english,
                    "german": german,
                    "example": v.get("context", "")
                })

            results["methods"]["claude_vision"] = {
                "name": "Loesung E: Claude Vision",
                "model": "claude-sonnet-4-20250514",
                "duration_seconds": round(duration_e, 2),
                "vocabulary_count": len(vocab_e),
                "vocabulary": vocab_e,
                "confidence": confidence_e,
                "error": error_e if error_e else None,
                "success": len(vocab_e) > 0
            }
            logger.info(f"Claude Vision: {len(vocab_e)} entries in {duration_e:.2f}s")
        else:
            # No API key configured - report as unavailable, not as a crash.
            results["methods"]["claude_vision"] = {
                "name": "Loesung E: Claude Vision",
                "error": "Anthropic API Key nicht konfiguriert",
                "success": False
            }
    except Exception as e:
        results["methods"]["claude_vision"] = {
            "name": "Loesung E: Claude Vision",
            "error": str(e),
            "success": False
        }
        logger.error(f"Claude Vision comparison failed: {e}")

    # ==========================================================================
    # Comparison Analysis: collect the union of all extracted pairs and
    # track which methods found each one.
    # ==========================================================================
    all_vocab = {}
    for method_key, method_data in results["methods"].items():
        if method_data.get("success"):
            for v in method_data.get("vocabulary", []):
                key = f"{v['english']}|{v['german']}"
                if key not in all_vocab:
                    all_vocab[key] = {"english": v["english"], "german": v["german"], "found_by": []}
                all_vocab[key]["found_by"].append(method_key)

    # Categorize vocabulary: full agreement vs. partial agreement.
    found_by_all = []
    found_by_some = []

    num_methods = len([m for m in results["methods"].values() if m.get("success")])

    for key, data in all_vocab.items():
        entry = {"english": data["english"], "german": data["german"], "methods": data["found_by"]}
        if len(data["found_by"]) == num_methods:
            found_by_all.append(entry)
        else:
            found_by_some.append(entry)

    results["comparison"] = {
        "found_by_all_methods": found_by_all,
        "found_by_some_methods": found_by_some,
        "total_unique_vocabulary": len(all_vocab),
        "agreement_rate": len(found_by_all) / len(all_vocab) if all_vocab else 0
    }

    # Determine best method - currently simply the one with the most entries.
    best_method = None
    best_count = 0
    for method_key, method_data in results["methods"].items():
        if method_data.get("success") and method_data.get("vocabulary_count", 0) > best_count:
            best_count = method_data["vocabulary_count"]
            best_method = method_key

    results["recommendation"] = {
        "best_method": best_method,
        "reason": f"Meiste Vokabeln erkannt ({best_count})"
    }

    return results
|
|
|
|
|
|
# =============================================================================
|
|
# Grid Detection and Analysis
|
|
# =============================================================================
|
|
|
|
@router.post("/sessions/{session_id}/analyze-grid/{page_number}")
async def analyze_grid(session_id: str, page_number: int):
    """
    Analyze a page and detect grid structure for layout-preserving OCR.

    This endpoint:
    1. Applies deskewing to straighten the image
    2. Runs OCR with bounding box extraction (Tesseract, falling back to
       PaddleOCR when Tesseract is not installed)
    3. Detects row and column structure
    4. Identifies recognized, empty, and problematic cells

    Returns grid structure with cell positions and recognition status; the
    grid is also persisted on the session for later retrieval/editing.

    Raises:
        HTTPException 404: session does not exist.
        HTTPException 400: no PDF uploaded or page number out of range.
        HTTPException 500: grid detection service missing or failed.
    """
    import numpy as np
    from PIL import Image
    import io

    logger.info(f"Grid analysis for session {session_id}, page {page_number}")

    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Get PDF data (cache first, then disk, re-caching on a disk hit)
    pdf_data = get_cached_pdf_data(session_id)
    if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path):
        with open(session.pdf_path, 'rb') as f:
            pdf_data = f.read()
        cache_pdf_data(session_id, pdf_data)

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.pdf_page_count or 1
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid page number. PDF has {page_count} pages (0-indexed)."
        )

    # Convert page to image (full resolution)
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Load image as numpy array for preprocessing
    img = Image.open(io.BytesIO(image_data))
    img_array = np.array(img)
    img_height, img_width = img_array.shape[:2]

    # Step 1: Deskewing - best effort; a missing module or a failure only
    # logs a warning and leaves the image untouched.
    deskew_angle = 0.0
    try:
        from services.image_preprocessing import deskew_image
        img_array, deskew_angle = deskew_image(img_array)
        logger.info(f"Applied deskew correction: {deskew_angle:.2f}°")
    except ImportError:
        logger.warning("Image preprocessing not available, skipping deskew")
    except Exception as e:
        logger.warning(f"Deskewing failed: {e}")

    # Step 2: Run OCR with position data. Tesseract is preferred; if its
    # import fails, fall back to the PaddleOCR service.
    ocr_regions = []
    try:
        import pytesseract
        from pytesseract import Output
        from services.grid_detection_service import convert_tesseract_regions

        # Convert back to PIL Image if we modified it during deskew
        if deskew_angle != 0:
            img = Image.fromarray(img_array)

        ocr_data = pytesseract.image_to_data(
            img,
            lang='eng+deu',
            output_type=Output.DICT
        )
        ocr_regions = convert_tesseract_regions(ocr_data, img_width, img_height)
        logger.info(f"OCR found {len(ocr_regions)} text regions")

    except ImportError:
        logger.warning("Tesseract not available, trying PaddleOCR")
        try:
            from hybrid_vocab_extractor import call_paddleocr_service
            from services.grid_detection_service import convert_paddleocr_regions

            # Convert to bytes for PaddleOCR
            buffer = io.BytesIO()
            Image.fromarray(img_array).save(buffer, format='PNG')
            paddle_regions, _ = await call_paddleocr_service(buffer.getvalue())

            # Reshape the Paddle region objects into the 4-corner bbox
            # format the grid converter expects.
            ocr_regions = convert_paddleocr_regions(
                [{"text": r.text, "confidence": r.confidence,
                  "bbox": [[r.x1, r.y1], [r.x2, r.y1], [r.x2, r.y2], [r.x1, r.y2]]}
                 for r in paddle_regions],
                img_width, img_height
            )
        except Exception as e:
            logger.error(f"PaddleOCR also failed: {e}")

    # No regions at all: return a structured failure rather than raising, so
    # the frontend can show "nothing detected" for this page.
    if not ocr_regions:
        return {
            "session_id": session_id,
            "page_number": page_number + 1,
            "success": False,
            "error": "No text regions detected",
            "grid": None,
            "deskew_angle": deskew_angle,
        }

    # Step 3: Detect grid structure from the positioned OCR regions
    try:
        from services.grid_detection_service import GridDetectionService

        grid_service = GridDetectionService()
        result = grid_service.detect_grid(ocr_regions, img_array, deskew_angle)

        # Store grid data in session for the /grid and /cell endpoints
        await update_session_db(
            session_id,
            grid_data=result.to_dict(),
            deskew_angle=deskew_angle
        )

        return {
            "session_id": session_id,
            "page_number": page_number + 1,
            "success": True,
            "grid": result.to_dict(),
            "deskew_angle": deskew_angle,
            "image_dimensions": {
                "width": img_width,
                "height": img_height
            }
        }

    except ImportError as e:
        logger.error(f"Grid detection service not available: {e}")
        raise HTTPException(status_code=500, detail="Grid detection service not available")
    except Exception as e:
        logger.error(f"Grid detection failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=f"Grid detection failed: {str(e)}")
|
|
|
|
|
|
@router.get("/sessions/{session_id}/grid")
async def get_grid(session_id: str):
    """
    Return the grid structure previously computed by analyze-grid.
    """
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # The grid only exists after a successful analyze-grid run.
    if not session.grid_data:
        raise HTTPException(status_code=404, detail="No grid data found. Run analyze-grid first.")

    return {
        "session_id": session_id,
        "grid": session.grid_data,
        "deskew_angle": session.deskew_angle,
    }
|
|
|
|
|
|
@router.get("/sessions/{session_id}/cell-crop/{page_number}/{row}/{col}")
async def get_cell_crop(session_id: str, page_number: int, row: int, col: int):
    """
    Get a cropped image of a specific grid cell.

    Useful for showing the original image content when manually correcting cells.

    Raises:
        HTTPException 404: session or cell does not exist.
        HTTPException 400: no grid data or no PDF data available.
    """
    from PIL import Image
    import io

    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    if not session.grid_data:
        raise HTTPException(status_code=400, detail="No grid data. Run analyze-grid first.")

    # Look up the cell. Negative indices would silently wrap around via
    # Python list indexing and serve the wrong cell, so reject them.
    cells = session.grid_data.get("cells", [])
    if row < 0 or col < 0 or row >= len(cells) or col >= len(cells[row]):
        raise HTTPException(status_code=404, detail="Cell not found")

    cell = cells[row][col]

    # Load the PDF: cache first, then disk. Re-cache after a disk hit so
    # subsequent cell requests stay fast (consistent with other endpoints).
    pdf_data = get_cached_pdf_data(session_id)
    if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path):
        with open(session.pdf_path, 'rb') as f:
            pdf_data = f.read()
        cache_pdf_data(session_id, pdf_data)

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF data available")

    # Render the page and crop the cell region; cell coordinates are stored
    # as percentages of the page dimensions.
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
    img = Image.open(io.BytesIO(image_data))
    img_width, img_height = img.size

    x1 = int(img_width * cell["x"] / 100)
    y1 = int(img_height * cell["y"] / 100)
    x2 = int(img_width * (cell["x"] + cell["width"]) / 100)
    y2 = int(img_height * (cell["y"] + cell["height"]) / 100)

    # Add a small margin so the crop shows surrounding context, clamped to
    # the image bounds.
    padding = 5
    x1 = max(0, x1 - padding)
    y1 = max(0, y1 - padding)
    x2 = min(img_width, x2 + padding)
    y2 = min(img_height, y2 + padding)

    cropped = img.crop((x1, y1, x2, y2))

    # Convert to PNG
    buffer = io.BytesIO()
    cropped.save(buffer, format='PNG')
    buffer.seek(0)

    return StreamingResponse(buffer, media_type="image/png")
|
|
|
|
|
|
@router.put("/sessions/{session_id}/cell/{row}/{col}")
async def update_cell(session_id: str, row: int, col: int, text: str = Form(...)):
    """
    Manually update the text content of a grid cell.

    Sets recognition_status to 'manual' for the updated cell, marks it with
    full confidence, recomputes the grid statistics, and persists the grid.

    Raises:
        HTTPException 404: session or cell does not exist.
        HTTPException 400: no grid data has been computed yet.
    """
    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    if not session.grid_data:
        raise HTTPException(status_code=400, detail="No grid data. Run analyze-grid first.")

    # Update cell in grid. Negative indices would wrap around via Python
    # list indexing and silently update the wrong cell, so reject them.
    grid_data = session.grid_data
    cells = grid_data.get("cells", [])

    if row < 0 or col < 0 or row >= len(cells) or col >= len(cells[row]):
        raise HTTPException(status_code=404, detail="Cell not found")

    cells[row][col]["text"] = text
    cells[row][col]["status"] = "manual"
    cells[row][col]["confidence"] = 1.0  # human input is authoritative

    # Recompute statistics. Count cells row by row so ragged grids (rows of
    # differing length) are tallied correctly; the previous
    # rows * len(first_row) product miscounted in that case and could make
    # "empty" and "coverage" wrong.
    recognized = sum(1 for r in cells for c in r if c.get("status") == "recognized")
    manual = sum(1 for r in cells for c in r if c.get("status") == "manual")
    problematic = sum(1 for r in cells for c in r if c.get("status") == "problematic")
    total = sum(len(r) for r in cells)

    grid_data["stats"] = {
        "recognized": recognized,
        "manual": manual,
        "problematic": problematic,
        "empty": total - recognized - manual - problematic,
        "total": total,
        "coverage": (recognized + manual) / total if total > 0 else 0
    }

    await update_session_db(session_id, grid_data=grid_data)

    return {
        "success": True,
        "cell": cells[row][col],
        "stats": grid_data["stats"]
    }
|
|
|
|
|
|
@router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
    session_id: str,
    pages: Optional[List[int]] = None,
    process_all: bool = False,
):
    """
    Process specific pages of an uploaded PDF.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    Args:
        session_id: Session with an uploaded PDF.
        pages: List of 0-indexed page numbers to process.
        process_all: If True, process all pages.

    Returns:
        Dict with per-page success/failure counts, extracted vocabulary
        count, and average extraction confidence.

    Raises:
        HTTPException: 404 if the session is missing, 400 if no PDF was
            uploaded or no valid pages were selected.
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")

    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Try cached PDF data first; fall back to the stored file on disk.
    pdf_data = get_cached_pdf_data(session_id)
    if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path):
        with open(session.pdf_path, 'rb') as f:
            pdf_data = f.read()
        cache_pdf_data(session_id, pdf_data)

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.pdf_page_count or 1

    # Determine which pages to process.
    if process_all:
        pages = list(range(page_count))
    elif not pages:
        pages = [0]  # Default to first page
    else:
        # Drop page numbers outside the document; passing them through
        # would make the PDF-to-image conversion fail for the whole batch.
        pages = [p for p in pages if 0 <= p < page_count]
        if not pages:
            raise HTTPException(status_code=400, detail="No valid pages selected")

    # Convert the selected pages to images.
    images = await convert_pdf_to_images(pdf_data, pages)

    # Extract vocabulary from each page SEQUENTIALLY (the local model
    # handles one image at a time).
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []

    for i, image_data in enumerate(images):
        page_num = pages[i]
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")

        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_num + 1}.png",
            page_number=page_num
        )

        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
        else:
            successful_pages.append(page_num + 1)
            total_confidence += confidence

            # Add page info to each entry and normalize to a plain dict
            # (entries may be pydantic models, plain objects, or mappings).
            for entry in vocabulary:
                entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
                entry_dict['source_page'] = page_num + 1
                all_vocabulary.append(entry_dict)

            logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")

    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0

    # Store vocabulary in DB (replace existing).
    await update_vocabulary_db(session_id, all_vocabulary)

    # Save the first page as a preview image.
    image_path = None
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        os.makedirs(session_dir, exist_ok=True)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])

    # Update session in DB.
    await update_session_db(
        session_id,
        status=SessionStatus.EXTRACTED.value,
        extraction_confidence=avg_confidence,
        processed_pages=pages,
        successful_pages=successful_pages,
        failed_pages=failed_pages,
        image_path=image_path,
    )

    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }

    if error_messages:
        result["errors"] = error_messages

    return result


@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
    """Delete a vocabulary session and all associated files."""
    if await get_session_db(session_id) is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Remove the session's on-disk artifacts (images, PDFs, worksheets).
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    if os.path.exists(session_dir):
        import shutil
        shutil.rmtree(session_dir)

    # Drop any cached PDF bytes held in memory for this session.
    clear_cached_pdf_data(session_id)

    # Remove the DB rows (CASCADE also deletes vocab_entries and
    # vocab_worksheets).
    if not await delete_session_db(session_id):
        raise HTTPException(status_code=500, detail="Failed to delete session from database")

    return {"message": "Session deleted successfully", "session_id": session_id}


# =============================================================================
# NRU Format Worksheet Generation
# =============================================================================

class NRUWorksheetRequest(BaseModel):
    """Options for generating a worksheet in NRU format.

    Consumed by the ``/sessions/{session_id}/generate-nru`` endpoint.
    """

    # Heading printed on the worksheet.
    title: Optional[str] = "Vokabeltest"
    # Whether to also render a solution PDF.
    include_solutions: bool = True
    # 1-indexed page numbers to include; None means all pages.
    specific_pages: Optional[List[int]] = None


@router.post("/sessions/{session_id}/generate-nru")
async def generate_nru_worksheet(session_id: str, request: NRUWorksheetRequest):
    """
    Generate worksheet PDF in NRU format.

    NRU Format:
    - Per scanned page, generates 2 worksheet pages:
      1. Vocabulary table (3 columns: English, German blank, Correction blank)
      2. Sentence practice (German sentence, 2 empty lines for English translation)

    Automatically separates vocabulary entries into:
    - Single words/phrases -> Vocabulary table
    - Full sentences (end with . ! ? or are long) -> Sentence practice

    Args:
        session_id: Session with extracted vocabulary
        request: Generation options (title, include_solutions, specific_pages)

    Returns:
        Worksheet and solution PDF download info

    Raises:
        HTTPException: 404 if the session is missing, 400 if it has no
            vocabulary, 500 if generation fails or the generator module
            is unavailable.
    """
    logger.info(f"Generating NRU worksheet for session {session_id}")

    session = await get_session_db(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")

    vocab_dicts = await get_vocabulary_db(session_id)
    if not vocab_dicts:
        raise HTTPException(status_code=400, detail="No vocabulary found in session")

    # Generate PDFs using the NRU format generator.
    try:
        from nru_worksheet_generator import generate_nru_pdf, separate_vocab_and_sentences

        # Split entries up front so the response can report how many
        # landed in the table vs. the sentence-practice section.
        vocab_list, sentence_list = separate_vocab_and_sentences(vocab_dicts)

        worksheet_pdf, solution_pdf = await generate_nru_pdf(
            entries=vocab_dicts,
            title=request.title or session.name,
            include_solutions=request.include_solutions
        )

        # Persist the generated PDFs under the session directory.
        worksheet_id = str(uuid.uuid4())
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        os.makedirs(session_dir, exist_ok=True)

        pdf_path = os.path.join(session_dir, f"nru_worksheet_{worksheet_id}.pdf")
        with open(pdf_path, 'wb') as f:
            f.write(worksheet_pdf)

        solution_path = None
        if solution_pdf:
            solution_path = os.path.join(session_dir, f"nru_solution_{worksheet_id}.pdf")
            with open(solution_path, 'wb') as f:
                f.write(solution_pdf)

        # Record the worksheet in the database.
        await create_worksheet_db(
            worksheet_id=worksheet_id,
            session_id=session_id,
            worksheet_types=["nru_format"],
            pdf_path=pdf_path,
            solution_path=solution_path,
        )

        # Unique source pages the vocabulary came from (entries without
        # a source_page default to page 1).
        pages = sorted(set(v.get("source_page", 1) for v in vocab_dicts))

        return {
            "worksheet_id": worksheet_id,
            "session_id": session_id,
            "format": "nru",
            "pdf_path": pdf_path,
            "solution_path": solution_path,
            "statistics": {
                "total_entries": len(vocab_dicts),
                "vocabulary_count": len(vocab_list),
                "sentence_count": len(sentence_list),
                "source_pages": pages,
                "worksheet_pages": len(pages) * 2,  # 2 pages per source page
            },
            "download_url": f"/api/v1/vocab/worksheets/{worksheet_id}/pdf",
            "solution_url": f"/api/v1/vocab/worksheets/{worksheet_id}/solution" if solution_path else None,
        }

    except ImportError as e:
        logger.error(f"NRU generator not available: {e}")
        raise HTTPException(status_code=500, detail="NRU worksheet generator not available") from e
    except Exception as e:
        # logger.exception logs the message plus the full traceback —
        # replaces the manual traceback.format_exc() dance.
        logger.exception(f"NRU worksheet generation failed: {e}")
        raise HTTPException(status_code=500, detail=f"Worksheet generation failed: {str(e)}") from e