"""
Layout Reconstruction Service for Worksheet Cleanup

Reconstructs the layout of a worksheet from an image:
1. Uses PaddleOCR to detect text with bounding boxes
2. Groups text into logical elements (headings, paragraphs, tables)
3. Generates Fabric.js compatible JSON for the worksheet editor

DATA PRIVACY: All processing happens locally on Mac Mini.
"""

import io
import json
import logging
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Any, Optional, Tuple

import numpy as np
from PIL import Image

# OpenCV is optional - only required for actual layout reconstruction
try:
    import cv2
    CV2_AVAILABLE = True
except ImportError:
    cv2 = None
    CV2_AVAILABLE = False

logger = logging.getLogger(__name__)

# Leading characters that mark a bulleted list item.
_BULLET_PREFIXES = ('•', '-', '–', '*')

# "1.", "10)", "3:" followed by more content. The delimiter must not be
# followed by a digit so decimals ("1.5 kg") and times ("12:30") are excluded.
_NUMBERED_ITEM_RE = re.compile(r"\d{1,3}[.):](?!\d).")

# Stroke/background colour per grid column type (see cells_to_fabric_json).
_DEFAULT_CELL_COLOR = '#6b7280'
_COL_TYPE_COLORS = {
    'column_en': '#3b82f6',
    'column_de': '#22c55e',
    'column_example': '#f97316',
    'column_text': '#a855f7',
    'page_ref': '#06b6d4',
    'column_marker': '#6b7280',
}


class ElementType(str, Enum):
    """Types of detected layout elements."""
    HEADING = "heading"
    PARAGRAPH = "paragraph"
    TEXT_LINE = "text_line"
    TABLE = "table"
    LIST_ITEM = "list_item"
    FORM_FIELD = "form_field"
    IMAGE = "image"


@dataclass
class TextElement:
    """A detected text element with position."""
    text: str
    x: float  # Left position (pixels)
    y: float  # Top position (pixels)
    width: float
    height: float
    confidence: float
    element_type: ElementType = ElementType.TEXT_LINE
    font_size: float = 14.0
    is_bold: bool = False
    is_centered: bool = False


@dataclass
class LayoutResult:
    """Result of layout reconstruction."""
    elements: List[TextElement]
    page_width: int
    page_height: int
    fabric_json: Dict[str, Any]
    table_regions: List[Dict[str, Any]] = field(default_factory=list)


def reconstruct_layout(
    image_bytes: bytes,
    detect_tables: bool = True
) -> LayoutResult:
    """
    Reconstruct the layout of a worksheet from an image.

    Args:
        image_bytes: Image as bytes
        detect_tables: Whether to detect table structures

    Returns:
        LayoutResult with elements and Fabric.js JSON. When OCR yields no
        text, the result has no elements and an empty Fabric.js canvas.

    Raises:
        ImportError: If OpenCV is not available
    """
    if not CV2_AVAILABLE:
        raise ImportError(
            "OpenCV (cv2) is required for layout reconstruction. "
            "Install with: pip install opencv-python-headless"
        )

    # Load image. Anything that is not plain grayscale/RGB (bilevel, palette,
    # alpha, CMYK, ...) is normalised to RGB so the array is always uint8 with
    # one or three channels, which is what the table detector handles.
    with Image.open(io.BytesIO(image_bytes)) as source_img:
        if source_img.mode in ("L", "RGB"):
            img_array = np.array(source_img)
        else:
            img_array = np.array(source_img.convert("RGB"))
    page_height, page_width = img_array.shape[:2]

    # Run PaddleOCR to get text with positions
    ocr_results = _run_paddle_ocr(image_bytes)

    if not ocr_results:
        logger.warning("No text detected by PaddleOCR")
        return LayoutResult(
            elements=[],
            page_width=page_width,
            page_height=page_height,
            fabric_json={"version": "5.3.0", "objects": []}
        )

    # Convert OCR results to TextElements
    elements = _convert_ocr_to_elements(ocr_results, page_width, page_height)

    # Group elements into lines and detect headings
    elements = _classify_elements(elements, page_width)

    # Detect table regions if enabled
    table_regions = []
    if detect_tables:
        table_regions = _detect_tables(img_array, elements)

    # Generate Fabric.js JSON
    fabric_json = _generate_fabric_json(elements, page_width, page_height)

    logger.info(
        "Layout reconstruction: %d elements, %d tables",
        len(elements), len(table_regions)
    )

    return LayoutResult(
        elements=elements,
        page_width=page_width,
        page_height=page_height,
        fabric_json=fabric_json,
        table_regions=table_regions
    )


def _run_paddle_ocr(image_bytes: bytes) -> List[Dict[str, Any]]:
    """
    Run PaddleOCR on an image.

    Returns list of {text, confidence, bbox} dicts, where bbox is
    [x1, y1, x2, y2]. Best effort: any failure is logged and an empty
    list is returned instead of raising.
    """
    try:
        # Imported lazily so this module loads without the OCR stack.
        from hybrid_vocab_extractor import run_paddle_ocr as paddle_ocr_func

        regions, _ = paddle_ocr_func(image_bytes)

        return [
            {
                "text": r.text,
                "confidence": r.confidence,
                "bbox": [r.x1, r.y1, r.x2, r.y2]
            }
            for r in regions
        ]
    except ImportError:
        logger.error("PaddleOCR not available")
        return []
    except Exception as e:
        # Deliberate catch-all: OCR failure must not abort the pipeline.
        logger.exception("PaddleOCR failed: %s", e)
        return []


def _convert_ocr_to_elements(
    ocr_results: List[Dict[str, Any]],
    page_width: int,
    page_height: int
) -> List[TextElement]:
    """
    Convert raw OCR results to TextElements.

    Args:
        ocr_results: Dicts with "text", "confidence" and "bbox"
            ([x1, y1, x2, y2]) keys
        page_width: Unused; kept for interface compatibility
        page_height: Unused; kept for interface compatibility

    Returns:
        One TextElement per OCR result, in input order
    """
    elements = []

    for result in ocr_results:
        x1, y1, x2, y2 = result["bbox"]

        # Calculate dimensions
        width = x2 - x1
        height = y2 - y1

        # Estimate font size from box height, clamped to 8..72
        font_size = max(8, min(72, height * 0.8))

        elements.append(TextElement(
            text=result["text"],
            x=x1,
            y=y1,
            width=width,
            height=height,
            confidence=result["confidence"],
            font_size=font_size
        ))

    return elements


def _classify_elements(
    elements: List[TextElement],
    page_width: int
) -> List[TextElement]:
    """
    Classify elements as headings, list items, form fields or text lines.

    The elements are updated in place and the same list is returned.

    Args:
        elements: Elements to classify
        page_width: Page width in pixels, used for the centering check

    Returns:
        The input list, with element_type (and, for headings, is_bold and
        is_centered) set on every element
    """
    if not elements:
        return elements

    # Calculate average metrics
    avg_font_size = sum(e.font_size for e in elements) / len(elements)
    avg_y = sum(e.y for e in elements) / len(elements)

    page_center = page_width / 2
    center_tolerance = page_width * 0.15

    for element in elements:
        # All text tests use the stripped text so leading OCR whitespace
        # does not hide a bullet or item number.
        text = element.text.strip()

        # Detect headings (larger font, near top, possibly centered)
        is_larger = element.font_size > avg_font_size * 1.3
        is_near_top = element.y < avg_y * 0.3
        element_center = element.x + element.width / 2
        is_centered = abs(element_center - page_center) < center_tolerance

        if is_larger and (is_near_top or is_centered):
            element.element_type = ElementType.HEADING
            element.is_bold = True
            element.is_centered = is_centered
        # Detect list items (start with bullet or number)
        elif text.startswith(_BULLET_PREFIXES) or _NUMBERED_ITEM_RE.match(text):
            element.element_type = ElementType.LIST_ITEM
        # Detect form fields (underscores or dotted lines)
        elif '_____' in text or '.....' in text:
            element.element_type = ElementType.FORM_FIELD
        else:
            element.element_type = ElementType.TEXT_LINE

    return elements


def _detect_tables(
    img_array: np.ndarray,
    elements: List[TextElement]
) -> List[Dict[str, Any]]:
    """
    Detect table regions in the image.

    Finds straight line segments with a probabilistic Hough transform and,
    if at least two horizontal and two vertical segments exist, reports one
    region spanning all of them.

    Args:
        img_array: Page image, either 2-D grayscale or 3-D RGB
        elements: Unused; kept for interface compatibility

    Returns:
        Empty list, or a single dict with x, y, width, height, rows and
        cols as plain ints. rows/cols are raw segment counts minus one.
    """
    tables = []

    # Convert to grayscale
    if len(img_array.shape) == 3:
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    else:
        gray = img_array

    # Detect horizontal and vertical lines
    edges = cv2.Canny(gray, 50, 150)

    # Detect lines using Hough transform
    lines = cv2.HoughLinesP(
        edges, 1, np.pi / 180, threshold=100,
        minLineLength=50, maxLineGap=10
    )

    if lines is None:
        return tables

    # Separate horizontal and vertical lines
    horizontal_lines = []
    vertical_lines = []

    # tolist() yields plain Python ints, keeping the result JSON-serializable.
    for x1, y1, x2, y2 in lines.reshape(-1, 4).tolist():
        angle = abs(float(np.degrees(np.arctan2(y2 - y1, x2 - x1))))
        # A segment has no direction: fold 90..180 back into 0..90 so a
        # horizontal segment reported right-to-left is not seen as vertical.
        if angle > 90:
            angle = 180 - angle

        if angle < 10:  # Horizontal
            horizontal_lines.append((x1, y1, x2, y2))
        elif angle > 80:  # Vertical
            vertical_lines.append((x1, y1, x2, y2))

    # Find table regions (intersections of horizontal and vertical lines)
    if len(horizontal_lines) >= 2 and len(vertical_lines) >= 2:
        # Bounding box of table; both endpoints are checked because a
        # segment's endpoints are not guaranteed to be ordered.
        min_x = min(min(l[0], l[2]) for l in vertical_lines)
        max_x = max(max(l[0], l[2]) for l in vertical_lines)
        min_y = min(min(l[1], l[3]) for l in horizontal_lines)
        max_y = max(max(l[1], l[3]) for l in horizontal_lines)

        tables.append({
            "x": min_x,
            "y": min_y,
            "width": max_x - min_x,
            "height": max_y - min_y,
            "rows": len(horizontal_lines) - 1,
            "cols": len(vertical_lines) - 1
        })

    return tables


def _generate_fabric_json(
    elements: List[TextElement],
    page_width: int,
    page_height: int
) -> Dict[str, Any]:
    """
    Generate Fabric.js compatible JSON from elements.

    Args:
        elements: Classified text elements
        page_width: Unused; kept for interface compatibility
        page_height: Unused; kept for interface compatibility

    Returns:
        Dict with "version", "objects" (one textbox per element) and
        "background"
    """
    fabric_objects = []

    for i, element in enumerate(elements):
        fabric_obj = {
            "type": "textbox",
            "version": "5.3.0",
            "originX": "left",
            "originY": "top",
            "left": element.x,
            "top": element.y,
            # Textboxes are never narrower than 100 px
            "width": max(element.width, 100),
            "height": element.height,
            "fill": "#000000",
            "stroke": None,
            "strokeWidth": 0,
            "text": element.text,
            "fontSize": element.font_size,
            "fontWeight": "bold" if element.is_bold else "normal",
            "fontFamily": "Arial",
            "textAlign": "center" if element.is_centered else "left",
            "underline": False,
            "lineHeight": 1.2,
            "charSpacing": 0,
            "splitByGrapheme": False,
            "editable": True,
            "selectable": True,
            "data": {
                "elementType": element.element_type.value,
                "confidence": element.confidence,
                "originalIndex": i
            }
        }
        fabric_objects.append(fabric_obj)

    return {
        "version": "5.3.0",
        "objects": fabric_objects,
        "background": "#ffffff"
    }


def layout_to_fabric_json(layout_result: LayoutResult) -> str:
    """
    Convert LayoutResult to JSON string for frontend.
    """
    return json.dumps(layout_result.fabric_json, ensure_ascii=False, indent=2)


def cells_to_fabric_json(
    cells: List[Dict[str, Any]],
    image_width: int,
    image_height: int,
) -> Dict[str, Any]:
    """Convert pipeline grid cells to Fabric.js-compatible JSON.

    Each cell becomes a Textbox object positioned at its bbox_pct
    coordinates (converted to pixels). Colour-coded by column type.

    Args:
        cells: List of cell dicts from GridResult (with bbox_pct, col_type,
            text). A missing or None bbox_pct falls back to x=0, y=0,
            w=10, h=3 (percent).
        image_width: Source image width in pixels.
        image_height: Source image height in pixels.

    Returns:
        Dict with Fabric.js canvas JSON (version + objects array).
    """
    fabric_objects = []

    for cell in cells:
        # "or {}" also covers a bbox_pct key that is present but None.
        bp = cell.get('bbox_pct') or {}
        x = bp.get('x', 0) / 100 * image_width
        y = bp.get('y', 0) / 100 * image_height
        w = bp.get('w', 10) / 100 * image_width
        h = bp.get('h', 3) / 100 * image_height

        col_type = cell.get('col_type', '')
        color = _COL_TYPE_COLORS.get(col_type, _DEFAULT_CELL_COLOR)
        # Font size follows cell height, clamped to 8..18
        font_size = max(8, min(18, h * 0.55))

        fabric_objects.append({
            "type": "textbox",
            "version": "6.0.0",
            "originX": "left",
            "originY": "top",
            "left": round(x, 1),
            "top": round(y, 1),
            "width": max(round(w, 1), 30),
            "height": round(h, 1),
            "fill": "#000000",
            "stroke": color,
            "strokeWidth": 1,
            "text": cell.get('text', ''),
            "fontSize": round(font_size, 1),
            "fontFamily": "monospace",
            "editable": True,
            "selectable": True,
            # Same colour with an appended "22" alpha suffix
            "backgroundColor": color + "22",
            "data": {
                "cellId": cell.get('cell_id', ''),
                "colType": col_type,
                "rowIndex": cell.get('row_index', 0),
                "colIndex": cell.get('col_index', 0),
                "originalText": cell.get('text', ''),
            },
        })

    return {
        "version": "6.0.0",
        "objects": fabric_objects,
    }


def reconstruct_and_clean(
    image_bytes: bytes,
    remove_handwriting: bool = True
) -> Tuple[bytes, LayoutResult]:
    """
    Full pipeline: clean handwriting and reconstruct layout.

    Args:
        image_bytes: Source image
        remove_handwriting: Whether to remove handwriting first

    Returns:
        Tuple of (cleaned image bytes, layout result)
    """
    if remove_handwriting:
        # Imported lazily so the inpainting service is only needed when used.
        from services.inpainting_service import remove_handwriting as clean_hw
        cleaned_bytes, _ = clean_hw(image_bytes)
    else:
        cleaned_bytes = image_bytes

    layout = reconstruct_layout(cleaned_bytes)

    return cleaned_bytes, layout