Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
546 lines
21 KiB
Python
546 lines
21 KiB
Python
"""
|
|
Vocabulary Worksheet Compare & Grid Analysis API.
|
|
|
|
Split from vocab_worksheet_analysis_api.py — contains the two largest
|
|
route handlers: compare_ocr_methods (~234 LOC) and analyze_grid (~255 LOC).
|
|
"""
|
|
|
|
from fastapi import APIRouter, HTTPException, Query
|
|
from typing import Dict, Any
|
|
import base64
|
|
import json
|
|
import logging
|
|
import os
|
|
|
|
from vocab_worksheet_extraction import extract_vocabulary_from_image
|
|
|
|
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
|
VISION_MODEL = os.getenv("VISION_MODEL", "llama3.2-vision:11b")
|
|
|
|
def _get_sessions():
    """Return the shared session store owned by vocab_worksheet_api.

    The import happens lazily inside the function body to avoid a
    circular import at module load time.
    """
    import vocab_worksheet_api
    return vocab_worksheet_api._sessions
|
|
from vocab_worksheet_generation import convert_pdf_page_to_image
|
|
|
|
# Try to import Tesseract extractor.
# Optional dependency: when the import fails, TESSERACT_AVAILABLE is forced
# False so the route handlers skip the Tesseract code paths entirely.
try:
    from tesseract_vocab_extractor import (
        run_tesseract_pipeline,
        match_positions_to_vocab, TESSERACT_AVAILABLE,
    )
except ImportError:
    TESSERACT_AVAILABLE = False

# Try to import CV Pipeline (opencv-based document reconstruction).
# Same optional-dependency pattern as above.
try:
    from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
except ImportError:
    CV_PIPELINE_AVAILABLE = False

# Try to import Grid Detection Service (spatial grid from OCR bounding boxes).
# Here the flag is set locally since the service module does not export one.
try:
    from services.grid_detection_service import GridDetectionService
    GRID_SERVICE_AVAILABLE = True
except ImportError:
    GRID_SERVICE_AVAILABLE = False

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# Router holding the compare-ocr and analyze-grid endpoints; mounted by the
# application elsewhere (split out of vocab_worksheet_analysis_api.py).
compare_router = APIRouter()
|
|
|
|
|
|
# =============================================================================
|
|
# OCR Compare & Grid Analysis Endpoints
|
|
# =============================================================================
|
|
|
|
|
|
@compare_router.post("/sessions/{session_id}/compare-ocr/{page_number}")
async def compare_ocr_methods(session_id: str, page_number: int):
    """
    Run multiple OCR methods on a page and compare results.

    This endpoint:
    1. Gets the page image from the session's uploaded PDF
    2. Runs Vision LLM extraction (primary method)
    3. Optionally runs Tesseract extraction
    4. Compares found vocabulary across methods
    5. Returns structured comparison results

    page_number is 0-indexed.

    Raises:
        HTTPException 404: unknown session_id.
        HTTPException 400: no PDF uploaded, or page_number out of range.
    """
    # NOTE(fix): the original also imported httpx here, but this handler
    # never uses it — removed. `time` is only needed for duration metrics.
    import time

    logger.info(f"Compare OCR for session {session_id}, page {page_number}")

    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert page to image (full resolution — OCR needs detail).
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Per-method result payloads keyed by method id, and per-method sets of
    # (english, german) pairs used for the cross-method comparison below.
    methods_results = {}
    all_vocab_sets = {}

    # --- Method: Vision LLM ---
    try:
        start = time.time()
        vocab, confidence, error = await extract_vocabulary_from_image(
            image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False
        )
        duration = time.time() - start

        vocab_list = []
        for v in vocab:
            # Entries may be pydantic models, plain objects, or mappings —
            # normalize to a dict before extracting fields.
            entry = v.dict() if hasattr(v, 'dict') else (v.__dict__.copy() if hasattr(v, '__dict__') else dict(v))
            vocab_list.append({
                "english": entry.get("english", ""),
                "german": entry.get("german", ""),
                "example": entry.get("example_sentence", ""),
            })

        methods_results["vision_llm"] = {
            "name": "Vision LLM",
            "model": VISION_MODEL,
            "duration_seconds": round(duration, 1),
            "vocabulary_count": len(vocab_list),
            "vocabulary": vocab_list,
            "confidence": confidence,
            "success": len(vocab_list) > 0 and not error,
            "error": error if error else None,
        }
        all_vocab_sets["vision_llm"] = {(v["english"].lower().strip(), v["german"].lower().strip()) for v in vocab_list if v["english"] and v["german"]}
    except Exception as e:
        logger.error(f"Vision LLM failed: {e}")
        methods_results["vision_llm"] = {
            "name": "Vision LLM",
            "model": VISION_MODEL,
            "duration_seconds": 0,
            "vocabulary_count": 0,
            "vocabulary": [],
            "confidence": 0,
            "success": False,
            "error": str(e),
        }
        all_vocab_sets["vision_llm"] = set()

    # --- Method: Tesseract OCR (bounding boxes + vocab extraction) ---
    if TESSERACT_AVAILABLE:
        try:
            start = time.time()
            tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu")
            duration = time.time() - start

            tess_vocab = tess_result.get("vocabulary", [])
            tess_words = tess_result.get("words", [])

            # Store Tesseract words in session for later use (grid analysis, position matching)
            session["tesseract_words"] = tess_words
            session["tesseract_image_width"] = tess_result.get("image_width", 0)
            session["tesseract_image_height"] = tess_result.get("image_height", 0)
            session[f"tesseract_page_{page_number}"] = tess_result

            vocab_list_tess = []
            for v in tess_vocab:
                vocab_list_tess.append({
                    "english": v.get("english", ""),
                    "german": v.get("german", ""),
                    "example": v.get("example", ""),
                })

            methods_results["tesseract"] = {
                "name": "Tesseract OCR",
                "model": "tesseract-ocr (eng+deu)",
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(vocab_list_tess),
                "vocabulary": vocab_list_tess,
                # Fixed heuristic confidence — Tesseract gives no single score here.
                "confidence": 0.7 if tess_vocab else 0,
                "success": len(vocab_list_tess) > 0,
                "error": tess_result.get("error"),
                "word_count": tess_result.get("word_count", 0),
                "columns_detected": len(tess_result.get("columns", [])),
            }
            all_vocab_sets["tesseract"] = {
                (v["english"].lower().strip(), v["german"].lower().strip())
                for v in vocab_list_tess if v["english"] and v["german"]
            }

            # Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results
            if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]:
                llm_vocab_with_bbox = match_positions_to_vocab(
                    tess_words,
                    methods_results["vision_llm"]["vocabulary"],
                    tess_result.get("image_width", 1),
                    tess_result.get("image_height", 1),
                )
                methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox

        except Exception as e:
            logger.error(f"Tesseract failed: {e}")
            import traceback
            logger.debug(traceback.format_exc())
            methods_results["tesseract"] = {
                "name": "Tesseract OCR",
                "model": "tesseract-ocr",
                "duration_seconds": 0,
                "vocabulary_count": 0,
                "vocabulary": [],
                "confidence": 0,
                "success": False,
                "error": str(e),
            }
            all_vocab_sets["tesseract"] = set()

    # --- Method: CV Pipeline (Document Reconstruction) ---
    if CV_PIPELINE_AVAILABLE:
        try:
            start = time.time()
            cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
            duration = time.time() - start

            cv_vocab = cv_result.vocabulary if not cv_result.error else []
            vocab_list_cv = []
            for v in cv_vocab:
                vocab_list_cv.append({
                    "english": v.get("english", ""),
                    "german": v.get("german", ""),
                    "example": v.get("example", ""),
                })

            methods_results["cv_pipeline"] = {
                "name": "CV Pipeline (Document Reconstruction)",
                "model": "opencv + tesseract (multi-pass)",
                "duration_seconds": round(duration, 1),
                "vocabulary_count": len(vocab_list_cv),
                "vocabulary": vocab_list_cv,
                # Fixed heuristic confidence, slightly above plain Tesseract.
                "confidence": 0.8 if cv_vocab else 0,
                "success": len(vocab_list_cv) > 0,
                "error": cv_result.error,
                "word_count": cv_result.word_count,
                "columns_detected": cv_result.columns_detected,
                "stages": cv_result.stages,
            }
            all_vocab_sets["cv_pipeline"] = {
                (v["english"].lower().strip(), v["german"].lower().strip())
                for v in vocab_list_cv if v["english"] and v["german"]
            }

        except Exception as e:
            logger.error(f"CV Pipeline failed: {e}")
            import traceback
            logger.debug(traceback.format_exc())
            methods_results["cv_pipeline"] = {
                "name": "CV Pipeline (Document Reconstruction)",
                "model": "opencv + tesseract (multi-pass)",
                "duration_seconds": 0,
                "vocabulary_count": 0,
                "vocabulary": [],
                "confidence": 0,
                "success": False,
                "error": str(e),
            }
            all_vocab_sets["cv_pipeline"] = set()

    # --- Build comparison ---
    # Union of all (english, german) pairs any method found.
    all_unique = set()
    for vs in all_vocab_sets.values():
        all_unique |= vs

    # Partition into pairs every method agrees on vs. pairs only some found.
    found_by_all = []
    found_by_some = []
    for english, german in sorted(all_unique):
        found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs]
        entry = {"english": english, "german": german, "methods": found_in}
        if len(found_in) == len(all_vocab_sets):
            found_by_all.append(entry)
        else:
            found_by_some.append(entry)

    # NOTE(fix): the original computed an unused `total_methods` local here —
    # removed as dead code.
    agreement_rate = len(found_by_all) / max(len(all_unique), 1) if all_unique else 0

    # Find best method: the one with the most vocabulary pairs.
    best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"

    return {
        "session_id": session_id,
        "page_number": page_number,
        "methods": methods_results,
        "comparison": {
            "found_by_all_methods": found_by_all,
            "found_by_some_methods": found_by_some,
            "total_unique_vocabulary": len(all_unique),
            "agreement_rate": agreement_rate,
        },
        "recommendation": {
            "best_method": best_method,
            "reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz",
        },
    }
|
|
|
|
|
|
@compare_router.post("/sessions/{session_id}/analyze-grid/{page_number}")
async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
    """
    Analyze the grid/table structure of a vocabulary page.

    Hybrid approach:
    1. If Tesseract bounding boxes are available (from compare-ocr), use them for
       real spatial positions via GridDetectionService.
    2. Otherwise fall back to Vision LLM for grid structure detection.

    page_number is 0-indexed.
    Returns GridData structure expected by the frontend GridOverlay component.

    Raises:
        HTTPException 404: unknown session_id.
        HTTPException 400: no PDF uploaded, or page_number out of range.
    """
    # NOTE(fix): the original also imported `time` here but never used it —
    # removed. httpx is used below for the Ollama request.
    import httpx

    logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")

    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        # NOTE(fix): was a placeholder-less f-string "Invalid page number."
        # — now matches the informative message used by compare_ocr_methods.
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert page to image (full resolution).
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
    # compare-ocr caches its Tesseract result per page under this session key.
    tess_page_data = session.get(f"tesseract_page_{page_number}")

    if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
        try:
            # Run Tesseract if not already cached
            if not tess_page_data:
                logger.info("Running Tesseract for grid analysis (not cached)")
                from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
                tess_page_data = await _run_tess(image_data, lang="eng+deu")
                session[f"tesseract_page_{page_number}"] = tess_page_data
                session["tesseract_words"] = tess_page_data.get("words", [])
                session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
                session["tesseract_image_height"] = tess_page_data.get("image_height", 0)

            tess_words = tess_page_data.get("words", [])
            img_w = tess_page_data.get("image_width", 0)
            img_h = tess_page_data.get("image_height", 0)

            if tess_words and img_w > 0 and img_h > 0:
                service = GridDetectionService()
                regions = service.convert_tesseract_regions(tess_words, img_w, img_h)

                if regions:
                    grid_result = service.detect_grid(regions)
                    grid_dict = grid_result.to_dict()

                    # Merge LLM text if available (better quality than Tesseract text)
                    # The LLM vocab was stored during compare-ocr
                    grid_dict["source"] = "tesseract+grid_service"
                    grid_dict["word_count"] = len(tess_words)

                    logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
                                f"{grid_result.stats.get('recognized', 0)} recognized")

                    return {"success": True, "grid": grid_dict}

            logger.info("Tesseract data insufficient, falling back to LLM")

        except Exception as e:
            # Non-fatal: fall through to the Vision LLM strategy below.
            logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
            import traceback
            logger.debug(traceback.format_exc())

    # --- Strategy 2: Fall back to Vision LLM ---
    image_base64 = base64.b64encode(image_data).decode("utf-8")

    grid_prompt = """Analyze this textbook page image. It contains a vocabulary table/grid.

Your task: Identify the TABLE STRUCTURE and extract each cell's content.

Return a JSON object with this EXACT structure:
{
  "rows": <number of rows>,
  "columns": <number of columns>,
  "column_types": ["english", "german", "example"],
  "entries": [
    {
      "row": 0,
      "col": 0,
      "text": "the word or phrase in this cell",
      "column_type": "english",
      "confidence": 0.95
    }
  ]
}

Rules:
- row and col are 0-indexed
- column_type is one of: "english", "german", "example", "unknown"
- Detect whether each column contains English words, German translations, or example sentences
- Include ALL non-empty cells
- confidence is 0.0-1.0 based on how clear the text is
- If a cell is empty, don't include it
- Return ONLY the JSON, no other text"""

    try:
        import asyncio

        raw_text = ""
        max_retries = 3
        # Retry loop: Ollama occasionally 500s when another request holds
        # the model; back off and retry a few times.
        for attempt in range(max_retries):
            async with httpx.AsyncClient(timeout=300.0) as client:
                response = await client.post(
                    f"{OLLAMA_URL}/api/chat",
                    json={
                        "model": VISION_MODEL,
                        "messages": [{"role": "user", "content": grid_prompt, "images": [image_base64]}],
                        "stream": False,
                        "options": {"temperature": 0.1, "num_predict": 8192},
                    },
                    timeout=300.0,
                )

            if response.status_code == 500 and attempt < max_retries - 1:
                wait_time = 10 * (attempt + 1)  # linear backoff: 10s, 20s
                logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
                await asyncio.sleep(wait_time)
                continue
            elif response.status_code != 200:
                error_detail = response.text[:200] if response.text else "Unknown error"
                return {"success": False, "error": f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."}

            raw_text = response.json().get("message", {}).get("content", "")
            break

        # Parse JSON from response — grab the outermost {...} span, since the
        # model may wrap the JSON in prose despite the prompt.
        import re
        json_match = re.search(r'\{[\s\S]*\}', raw_text)
        if not json_match:
            return {"success": False, "error": "Could not parse grid structure from LLM response"}

        grid_raw = json.loads(json_match.group())

        num_rows = grid_raw.get("rows", 0)
        num_cols = grid_raw.get("columns", 0)
        column_types = grid_raw.get("column_types", [])
        entries = grid_raw.get("entries", [])

        if num_rows == 0 or num_cols == 0:
            return {"success": False, "error": "No grid structure detected"}

        # Ensure column_types has the right length
        while len(column_types) < num_cols:
            column_types.append("unknown")

        # Build cell grid with percentage-based coordinates (the LLM gives no
        # pixel positions, so assume a uniform grid over the page).
        row_height = 100.0 / num_rows
        col_width = 100.0 / num_cols

        # Track which cells have content
        cell_map = {}
        for entry in entries:
            r = entry.get("row", 0)
            c = entry.get("col", 0)
            cell_map[(r, c)] = entry

        cells = []
        recognized_count = 0
        empty_count = 0
        problematic_count = 0

        for r in range(num_rows):
            row_cells = []
            for c in range(num_cols):
                x = c * col_width
                y = r * row_height

                if (r, c) in cell_map:
                    entry = cell_map[(r, c)]
                    text = entry.get("text", "").strip()
                    conf = entry.get("confidence", 0.8)
                    col_type = entry.get("column_type", column_types[c] if c < len(column_types) else "unknown")

                    if text:
                        # Below 0.5 confidence a cell is flagged for manual review.
                        status = "recognized" if conf >= 0.5 else "problematic"
                        if status == "recognized":
                            recognized_count += 1
                        else:
                            problematic_count += 1
                    else:
                        status = "empty"
                        empty_count += 1
                else:
                    text = ""
                    conf = 0.0
                    col_type = column_types[c] if c < len(column_types) else "unknown"
                    status = "empty"
                    empty_count += 1

                row_cells.append({
                    "row": r,
                    "col": c,
                    "x": round(x, 2),
                    "y": round(y, 2),
                    "width": round(col_width, 2),
                    "height": round(row_height, 2),
                    "text": text,
                    "confidence": conf,
                    "status": status,
                    "column_type": col_type,
                })
            cells.append(row_cells)

        total = num_rows * num_cols
        coverage = (recognized_count + problematic_count) / max(total, 1)

        # Column and row boundaries as percentages
        col_boundaries = [round(c * col_width, 2) for c in range(num_cols + 1)]
        row_boundaries = [round(r * row_height, 2) for r in range(num_rows + 1)]

        grid_data = {
            "rows": num_rows,
            "columns": num_cols,
            "cells": cells,
            "column_types": column_types,
            "column_boundaries": col_boundaries,
            "row_boundaries": row_boundaries,
            "deskew_angle": 0.0,
            "source": "vision_llm",
            "stats": {
                "recognized": recognized_count,
                "problematic": problematic_count,
                "empty": empty_count,
                "manual": 0,
                "total": total,
                "coverage": round(coverage, 3),
            },
        }

        return {"success": True, "grid": grid_data}

    except httpx.TimeoutException:
        logger.error("Grid analysis timed out")
        return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"}
    except Exception as e:
        logger.error(f"Grid analysis failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"}
|