Files
breakpilot-lehrer/klausur-service/backend/worksheet_editor_reconstruct.py
Benjamin Admin b2a0126f14 [split-required] Split remaining Python monoliths (Phase 1 continued)
klausur-service (7 monoliths):
- grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones)
- cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab)
- worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes)
- legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion)
- cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel)
- cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel)
- rbac.py, admin_api.py, routes/eh.py remain (next batch)

backend-lehrer (1 monolith):
- classroom_engine/repository.py (1,705 → 7 files by domain)

All re-export barrels preserve backward compatibility.
Zero import errors verified.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 22:47:59 +02:00

256 lines
8.3 KiB
Python

"""
Worksheet Editor Reconstruct — Document reconstruction from vocab sessions.
"""
import io
import uuid
import base64
import logging
from typing import List, Dict
import numpy as np
from worksheet_editor_models import (
ReconstructRequest,
ReconstructResponse,
)
logger = logging.getLogger(__name__)
async def reconstruct_document_logic(request: ReconstructRequest) -> ReconstructResponse:
"""
Reconstruct a document from a vocab session into Fabric.js canvas format.
This function:
1. Loads the original PDF from the vocab session
2. Runs OCR with position tracking
3. Creates Fabric.js canvas JSON with positioned elements
4. Maps extracted vocabulary to their positions
Returns ReconstructResponse ready to send to the client.
"""
from fastapi import HTTPException
from vocab_worksheet_api import _sessions, convert_pdf_page_to_image
# Check if session exists
if request.session_id not in _sessions:
raise HTTPException(status_code=404, detail=f"Session {request.session_id} not found")
session = _sessions[request.session_id]
if not session.get("pdf_data"):
raise HTTPException(status_code=400, detail="Session has no PDF data")
pdf_data = session["pdf_data"]
page_count = session.get("pdf_page_count", 1)
if request.page_number < 1 or request.page_number > page_count:
raise HTTPException(
status_code=400,
detail=f"Page {request.page_number} not found. PDF has {page_count} pages."
)
vocabulary = session.get("vocabulary", [])
page_vocab = [v for v in vocabulary if v.get("source_page") == request.page_number]
logger.info(f"Reconstructing page {request.page_number} from session {request.session_id}")
logger.info(f"Found {len(page_vocab)} vocabulary items for this page")
image_bytes = await convert_pdf_page_to_image(pdf_data, request.page_number)
if not image_bytes:
raise HTTPException(status_code=500, detail="Failed to convert PDF page to image")
from PIL import Image
img = Image.open(io.BytesIO(image_bytes))
img_width, img_height = img.size
from hybrid_vocab_extractor import run_paddle_ocr
ocr_regions, raw_text = run_paddle_ocr(image_bytes)
logger.info(f"OCR found {len(ocr_regions)} text regions")
A4_WIDTH = 794
A4_HEIGHT = 1123
scale_x = A4_WIDTH / img_width
scale_y = A4_HEIGHT / img_height
fabric_objects = []
# 1. Add white background
fabric_objects.append({
"type": "rect", "left": 0, "top": 0,
"width": A4_WIDTH, "height": A4_HEIGHT,
"fill": "#ffffff", "selectable": False,
"evented": False, "isBackground": True
})
# 2. Group OCR regions by Y-coordinate to detect rows
sorted_regions = sorted(ocr_regions, key=lambda r: (r.y1, r.x1))
# 3. Detect headers (larger text at top)
headers = []
for region in sorted_regions:
height = region.y2 - region.y1
if region.y1 < img_height * 0.15 and height > 30:
headers.append(region)
# 4. Create text objects for each region
vocab_matched = 0
for region in sorted_regions:
left = int(region.x1 * scale_x)
top = int(region.y1 * scale_y)
is_header = region in headers
region_height = region.y2 - region.y1
base_font_size = max(10, min(32, int(region_height * scale_y * 0.8)))
if is_header:
base_font_size = max(base_font_size, 24)
is_vocab = False
vocab_match = None
for v in page_vocab:
if v.get("english", "").lower() in region.text.lower() or \
v.get("german", "").lower() in region.text.lower():
is_vocab = True
vocab_match = v
vocab_matched += 1
break
text_obj = {
"type": "i-text",
"id": f"text_{uuid.uuid4().hex[:8]}",
"left": left, "top": top,
"text": region.text,
"fontFamily": "Arial",
"fontSize": base_font_size,
"fontWeight": "bold" if is_header else "normal",
"fill": "#000000",
"originX": "left", "originY": "top",
}
if is_vocab and vocab_match:
text_obj["isVocabulary"] = True
text_obj["vocabularyId"] = vocab_match.get("id")
text_obj["english"] = vocab_match.get("english")
text_obj["german"] = vocab_match.get("german")
fabric_objects.append(text_obj)
# 5. If include_images, detect and extract image regions
if request.include_images:
image_regions = await _detect_image_regions(image_bytes, ocr_regions, img_width, img_height)
for i, img_region in enumerate(image_regions):
img_x1 = int(img_region["x1"])
img_y1 = int(img_region["y1"])
img_x2 = int(img_region["x2"])
img_y2 = int(img_region["y2"])
cropped = img.crop((img_x1, img_y1, img_x2, img_y2))
buffer = io.BytesIO()
cropped.save(buffer, format='PNG')
buffer.seek(0)
img_base64 = f"data:image/png;base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"
fabric_objects.append({
"type": "image",
"id": f"img_{uuid.uuid4().hex[:8]}",
"left": int(img_x1 * scale_x),
"top": int(img_y1 * scale_y),
"width": int((img_x2 - img_x1) * scale_x),
"height": int((img_y2 - img_y1) * scale_y),
"src": img_base64,
"scaleX": 1, "scaleY": 1,
})
import json
canvas_data = {
"version": "6.0.0",
"objects": fabric_objects,
"background": "#ffffff"
}
return ReconstructResponse(
canvas_json=json.dumps(canvas_data),
page_width=A4_WIDTH,
page_height=A4_HEIGHT,
elements_count=len(fabric_objects),
vocabulary_matched=vocab_matched,
message=f"Reconstructed page {request.page_number} with {len(fabric_objects)} elements, "
f"{vocab_matched} vocabulary items matched"
)
async def _detect_image_regions(
image_bytes: bytes,
ocr_regions: list,
img_width: int,
img_height: int
) -> List[Dict]:
"""
Detect image/graphic regions in the document.
Uses a simple approach:
1. Find large gaps between text regions (potential image areas)
2. Use edge detection to find bounded regions
3. Filter out text areas
"""
from PIL import Image
import cv2
try:
img = Image.open(io.BytesIO(image_bytes))
img_array = np.array(img.convert('L'))
text_mask = np.ones_like(img_array, dtype=bool)
for region in ocr_regions:
x1 = max(0, region.x1 - 5)
y1 = max(0, region.y1 - 5)
x2 = min(img_width, region.x2 + 5)
y2 = min(img_height, region.y2 + 5)
text_mask[y1:y2, x1:x2] = False
image_regions = []
edges = cv2.Canny(img_array, 50, 150)
edges[~text_mask] = 0
contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for contour in contours:
x, y, w, h = cv2.boundingRect(contour)
if w > 50 and h > 50:
if w < img_width * 0.9 and h < img_height * 0.9:
region_content = img_array[y:y+h, x:x+w]
variance = np.var(region_content)
if variance > 500:
image_regions.append({
"x1": x, "y1": y,
"x2": x + w, "y2": y + h
})
filtered_regions = []
for region in sorted(image_regions, key=lambda r: (r["x2"]-r["x1"])*(r["y2"]-r["y1"]), reverse=True):
overlaps = False
for existing in filtered_regions:
if not (region["x2"] < existing["x1"] or region["x1"] > existing["x2"] or
region["y2"] < existing["y1"] or region["y1"] > existing["y2"]):
overlaps = True
break
if not overlaps:
filtered_regions.append(region)
logger.info(f"Detected {len(filtered_regions)} image regions")
return filtered_regions[:10]
except Exception as e:
logger.warning(f"Image region detection failed: {e}")
return []