# breakpilot-lehrer/klausur-service/backend/services/layout_reconstruction_service.py

"""
Layout Reconstruction Service for Worksheet Cleanup

Reconstructs the layout of a worksheet from an image:
1. Uses PaddleOCR to detect text with bounding boxes
2. Groups text into logical elements (headings, paragraphs, tables)
3. Generates Fabric.js compatible JSON for the worksheet editor

PRIVACY (Datenschutz): All processing happens locally on the Mac Mini.
"""

import numpy as np
from PIL import Image
import io
import json
import logging
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, field
from enum import Enum

# OpenCV is optional - only required for actual layout reconstruction
try:
    import cv2
    CV2_AVAILABLE = True
except ImportError:
    cv2 = None
    CV2_AVAILABLE = False

logger = logging.getLogger(__name__)


class ElementType(str, Enum):
    """
    Types of detected layout elements.

    Subclasses ``str`` so every member is also a plain string. The member
    value is what ``_generate_fabric_json`` writes into ``data.elementType``
    of each Fabric.js object.
    """
    # Assigned by _classify_elements: font clearly larger than the page
    # average AND (close to the top OR horizontally centered).
    HEADING = "heading"
    # Not assigned by the classification code visible in this module.
    PARAGRAPH = "paragraph"
    # Default for TextElement and the fallback of _classify_elements.
    TEXT_LINE = "text_line"
    # Not assigned to text elements in this module; table regions are
    # reported separately as dicts by _detect_tables.
    TABLE = "table"
    # Assigned by _classify_elements: text starting with a bullet character
    # or with a digit followed by '.', ')' or ':'.
    LIST_ITEM = "list_item"
    # Assigned by _classify_elements: text containing '_____' or '.....'.
    FORM_FIELD = "form_field"
    # Not assigned by the classification code visible in this module.
    IMAGE = "image"


@dataclass
class TextElement:
    """
    A detected text element with position.

    Instances are created by ``_convert_ocr_to_elements`` from OCR bounding
    boxes and then refined in place by ``_classify_elements``.
    """
    text: str  # Recognised text of the OCR region
    x: float  # Left position (pixels)
    y: float  # Top position (pixels)
    width: float  # Bounding-box width (x2 - x1 of the OCR box)
    height: float  # Bounding-box height (y2 - y1 of the OCR box)
    confidence: float  # Confidence value passed through from the OCR result
    element_type: ElementType = ElementType.TEXT_LINE  # Set by _classify_elements
    font_size: float = 14.0  # Estimated as 0.8 * box height, clamped to 8..72
    is_bold: bool = False  # Set to True for headings
    is_centered: bool = False  # Set for headings that sit near the page center


@dataclass
class LayoutResult:
    """
    Result of layout reconstruction.

    Returned by ``reconstruct_layout``; ``layout_to_fabric_json`` serialises
    the ``fabric_json`` member for the frontend.
    """
    elements: List[TextElement]  # Classified text elements (empty if OCR found nothing)
    page_width: int  # Width of the decoded image array (pixels)
    page_height: int  # Height of the decoded image array (pixels)
    fabric_json: Dict[str, Any]  # Fabric.js canvas dict with "version" and "objects"
    # Dicts with "x", "y", "width", "height", "rows", "cols" as produced by
    # _detect_tables; stays empty when table detection is disabled.
    table_regions: List[Dict[str, Any]] = field(default_factory=list)


def reconstruct_layout(
    image_bytes: bytes,
    detect_tables: bool = True
) -> LayoutResult:
    """
    Reconstruct the layout of a worksheet from an image.

    Pipeline: decode the image, run OCR on the original bytes, convert the
    OCR regions to TextElements, classify them, optionally look for a ruled
    table, and build the Fabric.js JSON.

    Args:
        image_bytes: Encoded image (any format Pillow can open)
        detect_tables: Whether to detect table structures

    Returns:
        LayoutResult with elements and Fabric.js JSON. When OCR finds no
        text the result has no elements, an empty Fabric.js object list and
        no table regions (table detection is skipped in that case).

    Raises:
        ImportError: If OpenCV is not available
    """
    if not CV2_AVAILABLE:
        raise ImportError(
            "OpenCV (cv2) is required for layout reconstruction. "
            "Install with: pip install opencv-python-headless"
        )

    # Decode and normalise to 3-channel 8-bit RGB. Without the conversion,
    # palette, 1-bit, 16-bit, CMYK or gray+alpha images produce arrays that
    # the OpenCV calls in _detect_tables either reject or misinterpret.
    # The context manager releases the decoder once the pixels are copied.
    with Image.open(io.BytesIO(image_bytes)) as img:
        img_array = np.array(img.convert("RGB"))
    page_height, page_width = img_array.shape[:2]

    # Run PaddleOCR to get text with positions (works on the original bytes,
    # so the reported coordinates refer to the unmodified image).
    ocr_results = _run_paddle_ocr(image_bytes)

    if not ocr_results:
        logger.warning("No text detected by PaddleOCR")
        return LayoutResult(
            elements=[],
            page_width=page_width,
            page_height=page_height,
            fabric_json={"version": "5.3.0", "objects": []}
        )

    # Convert OCR results to TextElements
    elements = _convert_ocr_to_elements(ocr_results, page_width, page_height)

    # Classify elements (headings, list items, form fields, plain lines)
    elements = _classify_elements(elements, page_width)

    # Detect table regions if enabled
    table_regions = []
    if detect_tables:
        table_regions = _detect_tables(img_array, elements)

    # Generate Fabric.js JSON
    fabric_json = _generate_fabric_json(elements, page_width, page_height)

    logger.info(
        "Layout reconstruction: %d elements, %d tables",
        len(elements), len(table_regions)
    )

    return LayoutResult(
        elements=elements,
        page_width=page_width,
        page_height=page_height,
        fabric_json=fabric_json,
        table_regions=table_regions
    )


def _run_paddle_ocr(image_bytes: bytes) -> List[Dict[str, Any]]:
    """
    Run PaddleOCR on an image.

    The OCR backend (``hybrid_vocab_extractor.run_paddle_ocr``) is imported
    lazily so this module can be loaded without it. This is a best-effort
    call: every failure is logged and reported as "no text found".

    Args:
        image_bytes: Encoded image handed to the OCR backend unchanged

    Returns:
        List of {text, confidence, bbox} dicts where bbox is
        [x1, y1, x2, y2]; an empty list if the backend cannot be imported
        or raises.
    """
    try:
        # Only the OCR entry point is needed here; importing further names
        # would make OCR fail whenever one of those unused names is missing.
        from hybrid_vocab_extractor import run_paddle_ocr as paddle_ocr_func

        # The second element of the returned pair is not used here.
        regions, _ = paddle_ocr_func(image_bytes)

        return [
            {
                "text": r.text,
                "confidence": r.confidence,
                "bbox": [r.x1, r.y1, r.x2, r.y2]
            }
            for r in regions
        ]
    except ImportError:
        logger.error("PaddleOCR not available")
        return []
    except Exception as e:
        # Keep the traceback in the log; the error itself is swallowed on
        # purpose so the caller can continue with an empty result.
        logger.error("PaddleOCR failed: %s", e, exc_info=True)
        return []


def _convert_ocr_to_elements(
    ocr_results: List[Dict[str, Any]],
    page_width: int,
    page_height: int
) -> List[TextElement]:
    """
    Turn raw OCR dicts into TextElement objects.

    Each input dict must provide "text", "confidence" and a "bbox" of the
    form [x1, y1, x2, y2]. The box is converted to left/top/width/height and
    a font size is estimated from the box height.

    Args:
        ocr_results: OCR regions as {text, confidence, bbox} dicts
        page_width: Page width in pixels (currently not used)
        page_height: Page height in pixels (currently not used)

    Returns:
        One TextElement per OCR region, in input order
    """
    def _build(entry):
        left, top, right, bottom = entry["bbox"]
        box_width = right - left
        box_height = bottom - top
        # 80 % of the box height, kept inside the 8..72 range.
        estimated_size = max(8, min(72, box_height * 0.8))
        return TextElement(
            text=entry["text"],
            x=left,
            y=top,
            width=box_width,
            height=box_height,
            confidence=entry["confidence"],
            font_size=estimated_size
        )

    return [_build(entry) for entry in ocr_results]


def _classify_elements(
    elements: List[TextElement],
    page_width: int
) -> List[TextElement]:
    """
    Classify elements as headings, list items, form fields or text lines.

    Rules, checked in this order for every element:
      1. HEADING    - font size above 1.3x the average AND (y below 0.3x the
                      average y OR horizontally centered within 15 % of the
                      page width). Headings are also marked bold.
      2. LIST_ITEM  - text (ignoring leading whitespace) starts with a
                      bullet character, or with a digit followed by
                      '.', ')' or ':' and at least one more character.
      3. FORM_FIELD - text contains '_____' or '.....'.
      4. TEXT_LINE  - everything else.

    Args:
        elements: Elements to classify; they are modified in place
        page_width: Page width in pixels, used for the centering test

    Returns:
        The same list that was passed in
    """
    if not elements:
        return elements

    # Calculate average metrics
    count = len(elements)
    avg_font_size = sum(e.font_size for e in elements) / count
    avg_y = sum(e.y for e in elements) / count

    # Loop invariants for the centering test
    page_center = page_width / 2
    center_tolerance = page_width * 0.15

    for element in elements:
        # Detect headings (larger font, near top, possibly centered)
        is_larger = element.font_size > avg_font_size * 1.3
        is_near_top = element.y < avg_y * 0.3
        element_center = element.x + element.width / 2
        is_centered = abs(element_center - page_center) < center_tolerance

        # Use the same whitespace-trimmed text for both list-marker checks;
        # indexing the raw text would miss " 1. ..." with leading whitespace.
        text = element.text.lstrip()
        starts_with_bullet = text.startswith(('•', '-', '–', '*'))
        starts_with_number = (
            len(text) > 2 and text[0].isdigit() and text[1] in '.):'
        )

        if is_larger and (is_near_top or is_centered):
            element.element_type = ElementType.HEADING
            element.is_bold = True
            element.is_centered = is_centered
        # Detect list items (start with bullet or number)
        elif starts_with_bullet or starts_with_number:
            element.element_type = ElementType.LIST_ITEM
        # Detect form fields (underscores or dotted lines)
        elif '_____' in element.text or '.....' in element.text:
            element.element_type = ElementType.FORM_FIELD
        else:
            element.element_type = ElementType.TEXT_LINE

    return elements


def _detect_tables(
    img_array: np.ndarray,
    elements: List[TextElement]
) -> List[Dict[str, Any]]:
    """
    Detect table regions in the image.

    Runs Canny edge detection and a probabilistic Hough transform, splits
    the found segments into near-horizontal and near-vertical ones, and, if
    there are at least two of each, reports one table whose bounding box
    spans the vertical segments in x and the horizontal segments in y.

    Args:
        img_array: Image as a 2-D (grayscale) or 3-D (RGB) array
        elements: Detected text elements (currently not used)

    Returns:
        Empty list, or a list with one dict holding "x", "y", "width",
        "height", "rows" and "cols" as plain Python ints.
    """
    tables = []

    # Convert to grayscale
    if len(img_array.shape) == 3:
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    else:
        gray = img_array

    # Detect edges, then straight segments on the edge map
    edges = cv2.Canny(gray, 50, 150)

    lines = cv2.HoughLinesP(
        edges, 1, np.pi/180, threshold=100,
        minLineLength=50, maxLineGap=10
    )

    if lines is None:
        return tables

    # Separate horizontal and vertical lines
    horizontal_lines = []
    vertical_lines = []

    for line in lines:
        # Plain ints keep the result JSON-serialisable (NumPy integers are
        # rejected by json.dumps).
        x1, y1, x2, y2 = (int(v) for v in line[0])

        # |atan2| lies in [0, 180]. Fold it into [0, 90] so the result does
        # not depend on which endpoint comes first: a segment reported
        # right-to-left (~180 deg) is horizontal, and a 135 deg diagonal is
        # neither horizontal nor vertical.
        angle = abs(float(np.degrees(np.arctan2(y2 - y1, x2 - x1))))
        if angle > 90:
            angle = 180 - angle

        if angle < 10:  # Horizontal
            horizontal_lines.append((x1, y1, x2, y2))
        elif angle > 80:  # Vertical
            vertical_lines.append((x1, y1, x2, y2))

    # A table needs at least two lines in each direction
    if len(horizontal_lines) >= 2 and len(vertical_lines) >= 2:
        # Bounding box of the table; both endpoints of every segment are
        # considered because their order is not relied upon.
        min_x = min(min(seg[0], seg[2]) for seg in vertical_lines)
        max_x = max(max(seg[0], seg[2]) for seg in vertical_lines)
        min_y = min(min(seg[1], seg[3]) for seg in horizontal_lines)
        max_y = max(max(seg[1], seg[3]) for seg in horizontal_lines)

        # NOTE(review): rows/cols are "segments minus one"; one ruled line
        # may yield several Hough segments, so these look like upper-bound
        # estimates rather than exact counts - confirm with real scans.
        tables.append({
            "x": min_x,
            "y": min_y,
            "width": max_x - min_x,
            "height": max_y - min_y,
            "rows": len(horizontal_lines) - 1,
            "cols": len(vertical_lines) - 1
        })

    return tables


def _generate_fabric_json(
    elements: List[TextElement],
    page_width: int,
    page_height: int
) -> Dict[str, Any]:
    """
    Build the Fabric.js canvas description for a list of text elements.

    Every element becomes one editable ``textbox`` object positioned at the
    element's top-left corner. The box is at least 100 px wide; headings
    marked bold/centered keep that styling. Classification, OCR confidence
    and the element's index are stored under ``data``.

    Args:
        elements: Classified text elements
        page_width: Page width in pixels (currently not used)
        page_height: Page height in pixels (currently not used)

    Returns:
        Dict with "version", "objects" and "background"
    """
    def _as_textbox(position, item):
        weight = "bold" if item.is_bold else "normal"
        alignment = "center" if item.is_centered else "left"
        metadata = {
            "elementType": item.element_type.value,
            "confidence": item.confidence,
            "originalIndex": position
        }
        return {
            "type": "textbox",
            "version": "5.3.0",
            "originX": "left",
            "originY": "top",
            "left": item.x,
            "top": item.y,
            # Never narrower than 100 px
            "width": max(item.width, 100),
            "height": item.height,
            "fill": "#000000",
            "stroke": None,
            "strokeWidth": 0,
            "text": item.text,
            "fontSize": item.font_size,
            "fontWeight": weight,
            "fontFamily": "Arial",
            "textAlign": alignment,
            "underline": False,
            "lineHeight": 1.2,
            "charSpacing": 0,
            "splitByGrapheme": False,
            "editable": True,
            "selectable": True,
            "data": metadata
        }

    return {
        "version": "5.3.0",
        "objects": [
            _as_textbox(position, item)
            for position, item in enumerate(elements)
        ],
        "background": "#ffffff"
    }


def layout_to_fabric_json(layout_result: LayoutResult) -> str:
    """
    Serialise the Fabric.js part of a layout result for the frontend.

    Args:
        layout_result: Result whose ``fabric_json`` dict is serialised

    Returns:
        JSON text indented by two spaces; non-ASCII characters are written
        as-is instead of being escaped.
    """
    canvas = layout_result.fabric_json
    serialised = json.dumps(canvas, indent=2, ensure_ascii=False)
    return serialised


def reconstruct_and_clean(
    image_bytes: bytes,
    remove_handwriting: bool = True
) -> Tuple[bytes, LayoutResult]:
    """
    Run the full pipeline: optional handwriting removal, then layout
    reconstruction on the resulting image.

    Args:
        image_bytes: Source image
        remove_handwriting: When true, the image is first passed through
            ``services.inpainting_service.remove_handwriting``; otherwise
            the source image is used unchanged

    Returns:
        Tuple of (image bytes that were analysed, layout result)
    """
    working_bytes = image_bytes

    if remove_handwriting:
        # Imported on demand so the inpainting service is only needed when
        # cleaning is requested.
        from services.inpainting_service import remove_handwriting as clean_hw
        # Only the first element of the returned pair is used.
        working_bytes, _ignored = clean_hw(image_bytes)

    return working_bytes, reconstruct_layout(working_bytes)