klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
256 lines
8.3 KiB
Python
256 lines
8.3 KiB
Python
"""
|
|
Worksheet Editor Reconstruct — Document reconstruction from vocab sessions.
|
|
"""
|
|
|
|
import io
|
|
import uuid
|
|
import base64
|
|
import logging
|
|
from typing import List, Dict
|
|
|
|
import numpy as np
|
|
|
|
from worksheet_editor_models import (
|
|
ReconstructRequest,
|
|
ReconstructResponse,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def reconstruct_document_logic(request: ReconstructRequest) -> ReconstructResponse:
    """Reconstruct a document page from a vocab session into Fabric.js canvas JSON.

    Steps:
      1. Validate the session and requested page number.
      2. Render the PDF page to an image.
      3. Run OCR with position tracking (PaddleOCR).
      4. Emit positioned Fabric.js text objects, tagging those that match
         vocabulary extracted for this page.
      5. Optionally detect image regions and embed them as base64 PNGs.

    Args:
        request: Carries session_id, 1-based page_number, and include_images.

    Returns:
        ReconstructResponse with the serialized canvas JSON, A4 page size,
        element count, and the number of matched vocabulary items.

    Raises:
        HTTPException: 404 if the session is unknown; 400 if the session has
            no PDF data or the page number is out of range; 500 if the page
            cannot be rendered to an image.
    """
    from fastapi import HTTPException
    from vocab_worksheet_api import _sessions, convert_pdf_page_to_image

    # --- Session / input validation -------------------------------------
    if request.session_id not in _sessions:
        raise HTTPException(status_code=404, detail=f"Session {request.session_id} not found")

    session = _sessions[request.session_id]

    if not session.get("pdf_data"):
        raise HTTPException(status_code=400, detail="Session has no PDF data")

    pdf_data = session["pdf_data"]
    page_count = session.get("pdf_page_count", 1)

    if request.page_number < 1 or request.page_number > page_count:
        raise HTTPException(
            status_code=400,
            detail=f"Page {request.page_number} not found. PDF has {page_count} pages."
        )

    # Only vocabulary extracted from this specific page is matched below.
    vocabulary = session.get("vocabulary", [])
    page_vocab = [v for v in vocabulary if v.get("source_page") == request.page_number]

    logger.info(f"Reconstructing page {request.page_number} from session {request.session_id}")
    logger.info(f"Found {len(page_vocab)} vocabulary items for this page")

    # --- Render page and run OCR ----------------------------------------
    image_bytes = await convert_pdf_page_to_image(pdf_data, request.page_number)
    if not image_bytes:
        raise HTTPException(status_code=500, detail="Failed to convert PDF page to image")

    from PIL import Image
    img = Image.open(io.BytesIO(image_bytes))
    img_width, img_height = img.size

    from hybrid_vocab_extractor import run_paddle_ocr
    ocr_regions, _raw_text = run_paddle_ocr(image_bytes)  # raw text unused here

    logger.info(f"OCR found {len(ocr_regions)} text regions")

    # Target canvas is A4-sized in CSS pixels; scale OCR pixel coordinates
    # from the rendered image into that coordinate system.
    A4_WIDTH = 794
    A4_HEIGHT = 1123
    scale_x = A4_WIDTH / img_width
    scale_y = A4_HEIGHT / img_height

    fabric_objects = []

    # 1. White, non-interactive page background.
    fabric_objects.append({
        "type": "rect", "left": 0, "top": 0,
        "width": A4_WIDTH, "height": A4_HEIGHT,
        "fill": "#ffffff", "selectable": False,
        "evented": False, "isBackground": True
    })

    # 2. Sort regions top-to-bottom, then left-to-right (reading order).
    sorted_regions = sorted(ocr_regions, key=lambda r: (r.y1, r.x1))

    # 3. Heuristic header detection: tall text (>30px) in the top 15% of the page.
    headers = []
    for region in sorted_regions:
        height = region.y2 - region.y1
        if region.y1 < img_height * 0.15 and height > 30:
            headers.append(region)

    # 4. Create one Fabric.js i-text object per OCR region.
    vocab_matched = 0

    for region in sorted_regions:
        left = int(region.x1 * scale_x)
        top = int(region.y1 * scale_y)

        is_header = region in headers

        # Font size tracks the scaled region height, clamped to [10, 32];
        # headers get at least 24pt.
        region_height = region.y2 - region.y1
        base_font_size = max(10, min(32, int(region_height * scale_y * 0.8)))

        if is_header:
            base_font_size = max(base_font_size, 24)

        # Match vocabulary by case-insensitive substring. Missing/None terms
        # are coerced to "" and skipped: "" is a substring of every string
        # and would otherwise tag every region (and None would crash .lower()).
        region_text_lower = region.text.lower()
        vocab_match = None
        for v in page_vocab:
            english = (v.get("english") or "").lower()
            german = (v.get("german") or "").lower()
            if (english and english in region_text_lower) or \
               (german and german in region_text_lower):
                vocab_match = v
                vocab_matched += 1
                break
        is_vocab = vocab_match is not None

        text_obj = {
            "type": "i-text",
            "id": f"text_{uuid.uuid4().hex[:8]}",
            "left": left, "top": top,
            "text": region.text,
            "fontFamily": "Arial",
            "fontSize": base_font_size,
            "fontWeight": "bold" if is_header else "normal",
            "fill": "#000000",
            "originX": "left", "originY": "top",
        }

        # Tag matched regions so the editor can link them back to the vocab list.
        if is_vocab and vocab_match:
            text_obj["isVocabulary"] = True
            text_obj["vocabularyId"] = vocab_match.get("id")
            text_obj["english"] = vocab_match.get("english")
            text_obj["german"] = vocab_match.get("german")

        fabric_objects.append(text_obj)

    # 5. Optionally detect graphic regions, crop them out of the rendered
    #    page, and embed each as a positioned base64-PNG image object.
    if request.include_images:
        image_regions = await _detect_image_regions(image_bytes, ocr_regions, img_width, img_height)

        for i, img_region in enumerate(image_regions):
            img_x1 = int(img_region["x1"])
            img_y1 = int(img_region["y1"])
            img_x2 = int(img_region["x2"])
            img_y2 = int(img_region["y2"])

            cropped = img.crop((img_x1, img_y1, img_x2, img_y2))

            buffer = io.BytesIO()
            cropped.save(buffer, format='PNG')
            buffer.seek(0)
            img_base64 = f"data:image/png;base64,{base64.b64encode(buffer.getvalue()).decode('utf-8')}"

            fabric_objects.append({
                "type": "image",
                "id": f"img_{uuid.uuid4().hex[:8]}",
                "left": int(img_x1 * scale_x),
                "top": int(img_y1 * scale_y),
                "width": int((img_x2 - img_x1) * scale_x),
                "height": int((img_y2 - img_y1) * scale_y),
                "src": img_base64,
                "scaleX": 1, "scaleY": 1,
            })

    import json
    canvas_data = {
        "version": "6.0.0",
        "objects": fabric_objects,
        "background": "#ffffff"
    }

    return ReconstructResponse(
        canvas_json=json.dumps(canvas_data),
        page_width=A4_WIDTH,
        page_height=A4_HEIGHT,
        elements_count=len(fabric_objects),
        vocabulary_matched=vocab_matched,
        message=f"Reconstructed page {request.page_number} with {len(fabric_objects)} elements, "
                f"{vocab_matched} vocabulary items matched"
    )
|
|
|
|
|
|
async def _detect_image_regions(
    image_bytes: bytes,
    ocr_regions: list,
    img_width: int,
    img_height: int
) -> List[Dict]:
    """Detect image/graphic regions in a rendered document page.

    Heuristic pipeline:
      1. Mask out (padded) OCR text boxes so text does not register as edges.
      2. Canny edge detection + external contours to find bounded regions.
      3. Keep contours that are reasonably sized (>50px per side, <90% of the
         page) and whose pixel variance suggests real content, not blank paper.
      4. Greedily drop regions overlapping an already-kept larger region.

    Args:
        image_bytes: Encoded image bytes of the rendered page.
        ocr_regions: OCR boxes exposing .x1/.y1/.x2/.y2 pixel coordinates.
        img_width: Page image width in pixels.
        img_height: Page image height in pixels.

    Returns:
        Up to 10 region dicts {"x1", "y1", "x2", "y2"} in image pixel
        coordinates; an empty list on any failure (best effort).
    """
    from PIL import Image
    import cv2

    try:
        img = Image.open(io.BytesIO(image_bytes))
        img_array = np.array(img.convert('L'))  # grayscale

        # Boolean mask, False wherever text was detected (with 5px padding).
        # Coordinates are coerced to int because numpy slicing rejects float
        # indices — some OCR backends return float boxes (TODO confirm here).
        text_mask = np.ones_like(img_array, dtype=bool)
        for region in ocr_regions:
            x1 = max(0, int(region.x1) - 5)
            y1 = max(0, int(region.y1) - 5)
            x2 = min(img_width, int(region.x2) + 5)
            y2 = min(img_height, int(region.y2) + 5)
            text_mask[y1:y2, x1:x2] = False

        image_regions = []

        # Edge detection; suppress edges inside masked text areas so contours
        # come only from graphics.
        edges = cv2.Canny(img_array, 50, 150)
        edges[~text_mask] = 0

        contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)

            # Ignore tiny specks and near-full-page boxes (likely page border).
            if w > 50 and h > 50:
                if w < img_width * 0.9 and h < img_height * 0.9:
                    region_content = img_array[y:y+h, x:x+w]
                    variance = np.var(region_content)

                    # Low-variance areas are blank paper, not graphics.
                    if variance > 500:
                        image_regions.append({
                            "x1": x, "y1": y,
                            "x2": x + w, "y2": y + h
                        })

        # Largest regions first; drop any that overlap an already-kept one.
        filtered_regions = []
        for region in sorted(image_regions, key=lambda r: (r["x2"]-r["x1"])*(r["y2"]-r["y1"]), reverse=True):
            overlaps = False
            for existing in filtered_regions:
                if not (region["x2"] < existing["x1"] or region["x1"] > existing["x2"] or
                        region["y2"] < existing["y1"] or region["y1"] > existing["y2"]):
                    overlaps = True
                    break
            if not overlaps:
                filtered_regions.append(region)

        logger.info(f"Detected {len(filtered_regions)} image regions")
        return filtered_regions[:10]

    except Exception as e:
        # Best-effort helper: image extraction is optional, never fail the page.
        logger.warning(f"Image region detection failed: {e}")
        return []
|