feat(ocr): Add CV Document Reconstruction Pipeline for vocabulary extraction

New OCR method using classical Computer Vision: high-res rendering (432 DPI),
deskew, dewarp, binarization, projection-profile layout analysis, multi-pass
Tesseract OCR with region-specific PSM, and Y-coordinate line alignment.
Includes bugfix for convert_pdf_to_image call (line 869) and 39 unit tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
BreakPilot Dev
2026-02-09 23:52:35 +01:00
parent 916ecef476
commit fa958d31f6
4 changed files with 2096 additions and 50 deletions

View File

@@ -42,6 +42,93 @@ except ImportError:
MINIO_AVAILABLE = False
logger.warning("MinIO storage not available, using local storage")
# Try to import Tesseract extractor (optional OCR dependency).
try:
    from tesseract_vocab_extractor import (
        extract_bounding_boxes, run_tesseract_pipeline,
        match_positions_to_vocab, TESSERACT_AVAILABLE,
    )
except ImportError:
    TESSERACT_AVAILABLE = False
    # Null out the imported names so they always exist at module level,
    # mirroring the vocab_session_store fallback below. Callers must
    # gate on TESSERACT_AVAILABLE before using them.
    extract_bounding_boxes = None
    run_tesseract_pipeline = None
    match_positions_to_vocab = None
    logger.warning("Tesseract extractor not available")
# Try to import CV Pipeline (optional document-reconstruction OCR).
try:
    from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
except ImportError:
    CV_PIPELINE_AVAILABLE = False
    # Define the fallback so the name always exists; callers gate on
    # CV_PIPELINE_AVAILABLE before invoking it.
    run_cv_pipeline = None
    logger.warning("CV vocab pipeline not available")
# Try to import Grid Detection Service (optional table-layout analysis).
try:
    from services.grid_detection_service import GridDetectionService
    GRID_SERVICE_AVAILABLE = True
except ImportError:
    GRID_SERVICE_AVAILABLE = False
    # Define the fallback so the class name always exists; callers gate
    # on GRID_SERVICE_AVAILABLE before instantiating it.
    GridDetectionService = None
    logger.warning("Grid Detection Service not available")
# Database integration (used by main.py lifespan)
try:
    from vocab_session_store import (
        DATABASE_URL, get_pool, init_vocab_tables,
        list_sessions_db, get_session_db,
    )
except ImportError:
    # Store module unavailable: fall back to an env-driven connection
    # URL and null out every DB helper so callers can feature-detect.
    DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
    get_pool = init_vocab_tables = None
    list_sessions_db = get_session_db = None
# Module-level handle to the shared database connection pool.
# None until main.py's lifespan hook calls set_db_pool().
_db_pool = None
def set_db_pool(pool) -> None:
    """Set the database connection pool (called from main.py lifespan)."""
    global _db_pool
    _db_pool = pool
async def _init_vocab_table():
    """Initialize vocab tables in database."""
    if not init_vocab_tables:
        # No DB-backed store imported; in-memory mode is ready as-is.
        logger.info("vocab_session_cache table ready")
        return
    try:
        await init_vocab_tables()
        logger.info("vocab_session_cache table ready")
    except Exception as e:
        # Best-effort: a failed table init degrades to in-memory caching.
        logger.warning(f"Failed to init vocab tables: {e}")
async def _load_all_sessions():
    """Load all vocab sessions from database into memory cache."""
    if not list_sessions_db:
        # DB store unavailable — nothing to hydrate.
        logger.info("Loaded 0 vocab sessions from database")
        return
    try:
        rows = await list_sessions_db(limit=500)
        loaded = 0
        for row in rows:
            session_id = row.get("id") or row.get("session_id")
            # Skip rows without an id and never clobber sessions
            # already present in the in-memory cache.
            if not session_id or session_id in _sessions:
                continue
            _sessions[session_id] = {
                "id": session_id,
                "name": row.get("name", ""),
                "description": row.get("description", ""),
                "status": row.get("status", "created"),
                "vocabulary_count": row.get("vocabulary_count", 0),
                "source_language": row.get("source_language", "en"),
                "target_language": row.get("target_language", "de"),
                "created_at": str(row.get("created_at", "")),
            }
            loaded += 1
        logger.info(f"Loaded {loaded} vocab sessions from database")
    except Exception as e:
        # Startup must not fail because the cache warm-up did.
        logger.warning(f"Failed to load sessions from database: {e}")
# FastAPI router for every vocabulary-worksheet endpoint in this module,
# mounted under /api/v1/vocab.
router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])
# Local storage path
@@ -786,7 +873,7 @@ async def upload_image(
# Convert PDF to image if needed
if is_pdf:
logger.info("Converting PDF to image...")
content = await convert_pdf_to_image(content)
content = await convert_pdf_page_to_image(content, page_number=0)
logger.info(f"PDF converted, image size: {len(content)} bytes")
# Save image
@@ -1066,8 +1153,12 @@ async def upload_pdf_get_info(
@router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int):
"""Get a thumbnail image of a specific PDF page."""
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
"""Get a thumbnail image of a specific PDF page.
Args:
hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).
"""
if session_id not in _sessions:
raise HTTPException(status_code=404, detail="Session not found")
@@ -1077,7 +1168,7 @@ async def get_pdf_thumbnail(session_id: str, page_number: int):
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
thumbnail = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=True)
thumbnail = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=not hires)
return StreamingResponse(
io.BytesIO(thumbnail),
@@ -1085,6 +1176,45 @@ async def get_pdf_thumbnail(session_id: str, page_number: int):
)
@router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
    """PDF page as PNG at arbitrary resolution (for editor view).

    Args:
        session_id: Vocab session holding the uploaded PDF (`pdf_data`).
        page_number: 0-indexed page to render.
        zoom: Zoom factor (0.5=72DPI, 1.0=144DPI, 2.0=288DPI, 4.0=576DPI).

    Returns:
        StreamingResponse carrying the rendered page as image/png.

    Raises:
        HTTPException: 404 for an unknown session, 400 for a missing PDF
            or out-of-range page, 500 when PyMuPDF rendering fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = _sessions[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            page = pdf_document[page_number]
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat)
            png_data = pix.tobytes("png")
        finally:
            # Close in finally so the document is released even when
            # get_pixmap()/tobytes() raises (the bare close() leaked it).
            pdf_document.close()
        logger.info(f"PDF page {page_number} rendered at zoom={zoom}: {len(png_data)} bytes")
    except Exception as e:
        logger.error(f"PDF page image failed: {e}")
        # Chain the cause so the original fitz error survives in tracebacks.
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}") from e
    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
@router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
session_id: str,
@@ -1436,9 +1566,122 @@ async def compare_ocr_methods(session_id: str, page_number: int):
}
all_vocab_sets["vision_llm"] = set()
# --- Method: Local LLM (same as vision but noted separately if available) ---
# For now, we treat vision_llm as the primary method.
# Tesseract method can be added here if tesseract_vocab_extractor is available.
# --- Method: Tesseract OCR (bounding boxes + vocab extraction) ---
if TESSERACT_AVAILABLE:
try:
start = time.time()
tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu")
duration = time.time() - start
tess_vocab = tess_result.get("vocabulary", [])
tess_words = tess_result.get("words", [])
# Store Tesseract words in session for later use (grid analysis, position matching)
session["tesseract_words"] = tess_words
session["tesseract_image_width"] = tess_result.get("image_width", 0)
session["tesseract_image_height"] = tess_result.get("image_height", 0)
session[f"tesseract_page_{page_number}"] = tess_result
vocab_list_tess = []
for v in tess_vocab:
vocab_list_tess.append({
"english": v.get("english", ""),
"german": v.get("german", ""),
"example": v.get("example", ""),
})
methods_results["tesseract"] = {
"name": "Tesseract OCR",
"model": "tesseract-ocr (eng+deu)",
"duration_seconds": round(duration, 1),
"vocabulary_count": len(vocab_list_tess),
"vocabulary": vocab_list_tess,
"confidence": 0.7 if tess_vocab else 0,
"success": len(vocab_list_tess) > 0,
"error": tess_result.get("error"),
"word_count": tess_result.get("word_count", 0),
"columns_detected": len(tess_result.get("columns", [])),
}
all_vocab_sets["tesseract"] = {
(v["english"].lower().strip(), v["german"].lower().strip())
for v in vocab_list_tess if v["english"] and v["german"]
}
# Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results
if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]:
llm_vocab_with_bbox = match_positions_to_vocab(
tess_words,
methods_results["vision_llm"]["vocabulary"],
tess_result.get("image_width", 1),
tess_result.get("image_height", 1),
)
methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox
except Exception as e:
logger.error(f"Tesseract failed: {e}")
import traceback
logger.debug(traceback.format_exc())
methods_results["tesseract"] = {
"name": "Tesseract OCR",
"model": "tesseract-ocr",
"duration_seconds": 0,
"vocabulary_count": 0,
"vocabulary": [],
"confidence": 0,
"success": False,
"error": str(e),
}
all_vocab_sets["tesseract"] = set()
# --- Method: CV Pipeline (Document Reconstruction) ---
if CV_PIPELINE_AVAILABLE:
try:
start = time.time()
cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
duration = time.time() - start
cv_vocab = cv_result.vocabulary if not cv_result.error else []
vocab_list_cv = []
for v in cv_vocab:
vocab_list_cv.append({
"english": v.get("english", ""),
"german": v.get("german", ""),
"example": v.get("example", ""),
})
methods_results["cv_pipeline"] = {
"name": "CV Pipeline (Document Reconstruction)",
"model": "opencv + tesseract (multi-pass)",
"duration_seconds": round(duration, 1),
"vocabulary_count": len(vocab_list_cv),
"vocabulary": vocab_list_cv,
"confidence": 0.8 if cv_vocab else 0,
"success": len(vocab_list_cv) > 0,
"error": cv_result.error,
"word_count": cv_result.word_count,
"columns_detected": cv_result.columns_detected,
"stages": cv_result.stages,
}
all_vocab_sets["cv_pipeline"] = {
(v["english"].lower().strip(), v["german"].lower().strip())
for v in vocab_list_cv if v["english"] and v["german"]
}
except Exception as e:
logger.error(f"CV Pipeline failed: {e}")
import traceback
logger.debug(traceback.format_exc())
methods_results["cv_pipeline"] = {
"name": "CV Pipeline (Document Reconstruction)",
"model": "opencv + tesseract (multi-pass)",
"duration_seconds": 0,
"vocabulary_count": 0,
"vocabulary": [],
"confidence": 0,
"success": False,
"error": str(e),
}
all_vocab_sets["cv_pipeline"] = set()
# --- Build comparison ---
all_unique = set()
@@ -1461,11 +1704,6 @@ async def compare_ocr_methods(session_id: str, page_number: int):
# Find best method
best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"
# Save vocabulary from best method in session for grid analysis (no second Ollama call needed)
best_vocab_list = methods_results.get(best_method, {}).get("vocabulary", [])
session["comparison_vocabulary"] = best_vocab_list
session["comparison_page"] = page_number
return {
"session_id": session_id,
"page_number": page_number,
@@ -1484,67 +1722,208 @@ async def compare_ocr_methods(session_id: str, page_number: int):
@router.post("/sessions/{session_id}/analyze-grid/{page_number}")
async def analyze_grid(session_id: str, page_number: int):
async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
"""
Build grid structure from comparison results (no Ollama call needed).
Analyze the grid/table structure of a vocabulary page.
Uses vocabulary stored in session by compare-ocr to compute
the grid layout instantly.
Hybrid approach:
1. If Tesseract bounding boxes are available (from compare-ocr), use them for
real spatial positions via GridDetectionService.
2. Otherwise fall back to Vision LLM for grid structure detection.
page_number is 0-indexed.
Returns GridData structure expected by the frontend GridOverlay component.
"""
logger.info(f"Grid analysis for session {session_id}, page {page_number}")
import httpx
import time
logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")
if session_id not in _sessions:
raise HTTPException(status_code=404, detail="Session not found")
session = _sessions[session_id]
pdf_data = session.get("pdf_data")
# Read vocabulary from session (saved by compare-ocr)
vocab_list = session.get("comparison_vocabulary")
if not vocab_list:
return {"success": False, "error": "Bitte zuerst Vergleich starten, bevor die Grid-Analyse ausgefuehrt wird."}
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
page_count = session.get("pdf_page_count", 1)
if page_number < 0 or page_number >= page_count:
raise HTTPException(status_code=400, detail=f"Invalid page number.")
# Convert page to image
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
# --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
tess_page_data = session.get(f"tesseract_page_{page_number}")
if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
try:
# Run Tesseract if not already cached
if not tess_page_data:
logger.info("Running Tesseract for grid analysis (not cached)")
from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
tess_page_data = await _run_tess(image_data, lang="eng+deu")
session[f"tesseract_page_{page_number}"] = tess_page_data
session["tesseract_words"] = tess_page_data.get("words", [])
session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
session["tesseract_image_height"] = tess_page_data.get("image_height", 0)
tess_words = tess_page_data.get("words", [])
img_w = tess_page_data.get("image_width", 0)
img_h = tess_page_data.get("image_height", 0)
if tess_words and img_w > 0 and img_h > 0:
service = GridDetectionService()
regions = service.convert_tesseract_regions(tess_words, img_w, img_h)
if regions:
grid_result = service.detect_grid(regions)
grid_dict = grid_result.to_dict()
# Merge LLM text if available (better quality than Tesseract text)
# The LLM vocab was stored during compare-ocr
grid_dict["source"] = "tesseract+grid_service"
grid_dict["word_count"] = len(tess_words)
logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
f"{grid_result.stats.get('recognized', 0)} recognized")
return {"success": True, "grid": grid_dict}
logger.info("Tesseract data insufficient, falling back to LLM")
except Exception as e:
logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
import traceback
logger.debug(traceback.format_exc())
# --- Strategy 2: Fall back to Vision LLM ---
image_base64 = base64.b64encode(image_data).decode("utf-8")
grid_prompt = """Analyze this textbook page image. It contains a vocabulary table/grid.
Your task: Identify the TABLE STRUCTURE and extract each cell's content.
Return a JSON object with this EXACT structure:
{
"rows": <number of rows>,
"columns": <number of columns>,
"column_types": ["english", "german", "example"],
"entries": [
{
"row": 0,
"col": 0,
"text": "the word or phrase in this cell",
"column_type": "english",
"confidence": 0.95
}
]
}
Rules:
- row and col are 0-indexed
- column_type is one of: "english", "german", "example", "unknown"
- Detect whether each column contains English words, German translations, or example sentences
- Include ALL non-empty cells
- confidence is 0.0-1.0 based on how clear the text is
- If a cell is empty, don't include it
- Return ONLY the JSON, no other text"""
try:
# Check if example column is present (at least one vocab has non-empty example)
has_examples = any(v.get("example", "").strip() for v in vocab_list)
num_cols = 3 if has_examples else 2
column_types = ["english", "german", "example"] if has_examples else ["english", "german"]
import asyncio
num_rows = len(vocab_list)
if num_rows == 0:
return {"success": False, "error": "Keine Vokabeln im Vergleichsergebnis gefunden."}
raw_text = ""
max_retries = 3
for attempt in range(max_retries):
async with httpx.AsyncClient(timeout=300.0) as client:
response = await client.post(
f"{OLLAMA_URL}/api/chat",
json={
"model": VISION_MODEL,
"messages": [{"role": "user", "content": grid_prompt, "images": [image_base64]}],
"stream": False,
"options": {"temperature": 0.1, "num_predict": 8192},
},
timeout=300.0,
)
if response.status_code == 500 and attempt < max_retries - 1:
wait_time = 10 * (attempt + 1)
logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
await asyncio.sleep(wait_time)
continue
elif response.status_code != 200:
error_detail = response.text[:200] if response.text else "Unknown error"
return {"success": False, "error": f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."}
raw_text = response.json().get("message", {}).get("content", "")
break
# Parse JSON from response
import re
json_match = re.search(r'\{[\s\S]*\}', raw_text)
if not json_match:
return {"success": False, "error": "Could not parse grid structure from LLM response"}
grid_raw = json.loads(json_match.group())
num_rows = grid_raw.get("rows", 0)
num_cols = grid_raw.get("columns", 0)
column_types = grid_raw.get("column_types", [])
entries = grid_raw.get("entries", [])
if num_rows == 0 or num_cols == 0:
return {"success": False, "error": "No grid structure detected"}
# Ensure column_types has the right length
while len(column_types) < num_cols:
column_types.append("unknown")
# Build cell grid with percentage-based coordinates
row_height = 100.0 / num_rows
col_width = 100.0 / num_cols
# Track which cells have content
cell_map = {}
for entry in entries:
r = entry.get("row", 0)
c = entry.get("col", 0)
cell_map[(r, c)] = entry
cells = []
recognized_count = 0
empty_count = 0
problematic_count = 0
for r, vocab in enumerate(vocab_list):
for r in range(num_rows):
row_cells = []
english = vocab.get("english", "").strip()
german = vocab.get("german", "").strip()
example = vocab.get("example", "").strip() if has_examples else None
col_values = [("english", english), ("german", german)]
if has_examples:
col_values.append(("example", example))
for c, (col_type, text) in enumerate(col_values):
for c in range(num_cols):
x = c * col_width
y = r * row_height
if text:
status = "recognized"
recognized_count += 1
conf = 0.9
if (r, c) in cell_map:
entry = cell_map[(r, c)]
text = entry.get("text", "").strip()
conf = entry.get("confidence", 0.8)
col_type = entry.get("column_type", column_types[c] if c < len(column_types) else "unknown")
if text:
status = "recognized" if conf >= 0.5 else "problematic"
if status == "recognized":
recognized_count += 1
else:
problematic_count += 1
else:
status = "empty"
empty_count += 1
else:
text = ""
conf = 0.0
col_type = column_types[c] if c < len(column_types) else "unknown"
status = "empty"
empty_count += 1
conf = 0.0
row_cells.append({
"row": r,
@@ -1553,7 +1932,7 @@ async def analyze_grid(session_id: str, page_number: int):
"y": round(y, 2),
"width": round(col_width, 2),
"height": round(row_height, 2),
"text": text or "",
"text": text,
"confidence": conf,
"status": status,
"column_type": col_type,
@@ -1561,8 +1940,9 @@ async def analyze_grid(session_id: str, page_number: int):
cells.append(row_cells)
total = num_rows * num_cols
coverage = recognized_count / max(total, 1)
coverage = (recognized_count + problematic_count) / max(total, 1)
# Column and row boundaries as percentages
col_boundaries = [round(c * col_width, 2) for c in range(num_cols + 1)]
row_boundaries = [round(r * row_height, 2) for r in range(num_rows + 1)]
@@ -1574,9 +1954,10 @@ async def analyze_grid(session_id: str, page_number: int):
"column_boundaries": col_boundaries,
"row_boundaries": row_boundaries,
"deskew_angle": 0.0,
"source": "vision_llm",
"stats": {
"recognized": recognized_count,
"problematic": 0,
"problematic": problematic_count,
"empty": empty_count,
"manual": 0,
"total": total,
@@ -1586,6 +1967,9 @@ async def analyze_grid(session_id: str, page_number: int):
return {"success": True, "grid": grid_data}
except httpx.TimeoutException:
logger.error("Grid analysis timed out")
return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"}
except Exception as e:
logger.error(f"Grid analysis failed: {e}")
import traceback