Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
376 lines
10 KiB
Python
376 lines
10 KiB
Python
"""
Layout Reconstruction Service for Worksheet Cleanup

Reconstructs the layout of a worksheet from an image:
1. Uses PaddleOCR to detect text with bounding boxes
2. Groups text into logical elements (headings, paragraphs, tables)
3. Generates Fabric.js compatible JSON for the worksheet editor

PRIVACY (Datenschutz): All processing happens locally on the Mac Mini.
"""
|
||
|
||
import io
import json
import logging
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Any, Optional, Tuple

import numpy as np
from PIL import Image
|
||
|
||
# OpenCV is an optional dependency: the module must stay importable without
# it, and reconstruct_layout() checks CV2_AVAILABLE before doing any work.
try:
    import cv2
except ImportError:
    cv2 = None
    CV2_AVAILABLE = False
else:
    CV2_AVAILABLE = True

logger = logging.getLogger(__name__)
|
||
|
||
|
||
class ElementType(str, Enum):
    """Kinds of layout element a detected region can be labelled as.

    The ``str`` mix-in makes every member an actual string, so a member
    compares equal to its plain value (``ElementType.TABLE == "table"``).
    """

    HEADING = "heading"
    PARAGRAPH = "paragraph"
    TEXT_LINE = "text_line"
    TABLE = "table"
    LIST_ITEM = "list_item"
    FORM_FIELD = "form_field"
    IMAGE = "image"
|
||
|
||
|
||
@dataclass
class TextElement:
    """One piece of recognised text together with its position and styling."""

    # Recognised text content.
    text: str
    # Left edge of the bounding box (pixels).
    x: float
    # Top edge of the bounding box (pixels).
    y: float
    # Bounding-box extent, in the same units as x / y.
    width: float
    height: float
    # Recognition confidence as reported by the OCR step.
    confidence: float
    # Classification of the element; plain text line until classified otherwise.
    element_type: ElementType = ElementType.TEXT_LINE
    # Font size used when the element is rendered in the editor.
    font_size: float = 14.0
    # Styling flags consumed when the Fabric.js objects are generated.
    is_bold: bool = False
    is_centered: bool = False
|
||
|
||
|
||
@dataclass
class LayoutResult:
    """Everything produced by one layout reconstruction run."""

    # Detected text elements.
    elements: List[TextElement]
    # Dimensions of the source image in pixels.
    page_width: int
    page_height: int
    # Fabric.js canvas description ("version", "objects", ...).
    fabric_json: Dict[str, Any]
    # Detected table regions; each instance gets its own empty list by default.
    table_regions: List[Dict[str, Any]] = field(default_factory=list)
|
||
|
||
|
||
def reconstruct_layout(
    image_bytes: bytes,
    detect_tables: bool = True
) -> LayoutResult:
    """
    Reconstruct the layout of a worksheet from an image.

    The image is decoded, OCR'd, the recognised regions are converted to
    TextElements and classified, table regions are optionally detected, and
    finally a Fabric.js document is generated.

    Args:
        image_bytes: Encoded image as bytes
        detect_tables: Whether to detect table structures

    Returns:
        LayoutResult with elements and Fabric.js JSON. When OCR finds no
        text, the result has no elements and an empty Fabric.js object list.

    Raises:
        ImportError: If OpenCV is not available
    """
    if not CV2_AVAILABLE:
        raise ImportError(
            "OpenCV (cv2) is required for layout reconstruction. "
            "Install with: pip install opencv-python-headless"
        )

    # Decode inside a context manager so the image handle is released, and
    # normalise to 8-bit RGB: palette, bilevel, 16-bit or two-channel images
    # would otherwise reach the OpenCV calls in a layout they cannot process.
    with Image.open(io.BytesIO(image_bytes)) as img:
        img_array = np.array(img.convert("RGB"))
    page_height, page_width = img_array.shape[:2]

    # Run PaddleOCR to get text with positions
    ocr_results = _run_paddle_ocr(image_bytes)

    if not ocr_results:
        logger.warning("No text detected by PaddleOCR")
        return LayoutResult(
            elements=[],
            page_width=page_width,
            page_height=page_height,
            fabric_json={"version": "5.3.0", "objects": []}
        )

    # Convert OCR results to TextElements
    elements = _convert_ocr_to_elements(ocr_results, page_width, page_height)

    # Label headings, list items and form fields
    elements = _classify_elements(elements, page_width)

    # Detect table regions if enabled
    table_regions = []
    if detect_tables:
        table_regions = _detect_tables(img_array, elements)

    # Generate Fabric.js JSON
    fabric_json = _generate_fabric_json(elements, page_width, page_height)

    logger.info(
        "Layout reconstruction: %d elements, %d tables",
        len(elements), len(table_regions)
    )

    return LayoutResult(
        elements=elements,
        page_width=page_width,
        page_height=page_height,
        fabric_json=fabric_json,
        table_regions=table_regions
    )
|
||
|
||
|
||
def _run_paddle_ocr(image_bytes: bytes) -> List[Dict[str, Any]]:
    """
    Run PaddleOCR on an image.

    OCR is treated as best-effort: any failure is logged and reported to the
    caller as "no text found" instead of being raised.

    Args:
        image_bytes: Encoded image as bytes

    Returns:
        List of {text, confidence, bbox} dicts, where bbox is
        [x1, y1, x2, y2]. Empty when OCR is unavailable or fails.
    """
    try:
        # Imported lazily so this module loads without the OCR stack. Only
        # the function is imported: pulling in names that are not used here
        # would just add another way for this import to fail.
        from hybrid_vocab_extractor import run_paddle_ocr as paddle_ocr_func

        regions, _ = paddle_ocr_func(image_bytes)

        return [
            {
                "text": r.text,
                "confidence": r.confidence,
                "bbox": [r.x1, r.y1, r.x2, r.y2]
            }
            for r in regions
        ]
    except ImportError:
        logger.error("PaddleOCR not available")
        return []
    except Exception as e:
        # logger.exception keeps the traceback, which the plain error() call
        # discarded; the message text itself is unchanged.
        logger.exception("PaddleOCR failed: %s", e)
        return []
|
||
|
||
|
||
def _convert_ocr_to_elements(
    ocr_results: List[Dict[str, Any]],
    page_width: int,
    page_height: int
) -> List[TextElement]:
    """
    Convert raw OCR results to TextElements.

    Args:
        ocr_results: Dicts with "text", "confidence" and "bbox"
            ([x1, y1, x2, y2]) keys
        page_width: Currently unused; kept for interface stability
        page_height: Currently unused; kept for interface stability

    Returns:
        One TextElement per OCR result, in input order. Geometry, confidence
        and font size are plain Python floats.
    """
    elements = []

    for result in ocr_results:
        # Coerce at the boundary: TextElement declares float fields, and
        # library scalar types (e.g. NumPy) must not leak through to the
        # JSON that is later generated from these elements.
        x1, y1, x2, y2 = (float(v) for v in result["bbox"])

        # Calculate dimensions
        width = x2 - x1
        height = y2 - y1

        # Estimate font size from box height, clamped to 8..72. Float bounds
        # keep the result a float even when a clamp applies.
        font_size = max(8.0, min(72.0, height * 0.8))

        elements.append(
            TextElement(
                text=result["text"],
                x=x1,
                y=y1,
                width=width,
                height=height,
                confidence=float(result["confidence"]),
                font_size=font_size
            )
        )

    return elements
|
||
|
||
|
||
def _classify_elements(
    elements: List[TextElement],
    page_width: int
) -> List[TextElement]:
    """
    Classify elements as headings, list items, form fields or text lines.

    Elements are updated in place; the same list is returned.

    Rules, first match wins:
      * heading    - font size more than 1.3x the average AND (well above
                     the average vertical position OR horizontally centred)
      * list item  - text starts with a bullet or a numbered marker
      * form field - text contains a run of underscores or dots
      * text line  - everything else

    Args:
        elements: Elements to classify
        page_width: Page width in pixels, used for the centring test

    Returns:
        The input list, with element_type (and, for headings, the styling
        flags) set on every element.
    """
    if not elements:
        return elements

    # Calculate average metrics
    avg_font_size = sum(e.font_size for e in elements) / len(elements)
    avg_y = sum(e.y for e in elements) / len(elements)

    for element in elements:
        # Detect headings (larger font, near top, possibly centered)
        is_larger = element.font_size > avg_font_size * 1.3
        is_near_top = element.y < avg_y * 0.3
        is_centered = abs((element.x + element.width / 2) - page_width / 2) < page_width * 0.15

        if is_larger and (is_near_top or is_centered):
            element.element_type = ElementType.HEADING
            element.is_bold = True
            element.is_centered = is_centered
        # Detect list items (start with bullet or number)
        elif _is_list_item(element.text):
            element.element_type = ElementType.LIST_ITEM
        # Detect form fields (underscores or dotted lines)
        elif '_____' in element.text or '.....' in element.text:
            element.element_type = ElementType.FORM_FIELD
        else:
            element.element_type = ElementType.TEXT_LINE

    return elements


_BULLET_PREFIXES = ('•', '-', '–', '*')

# Numbered marker: 1-3 digits, then '.', ')' or ':', then a non-digit.
# Requiring the non-digit keeps decimals, dates and clock times
# ("3.5", "12.03.2024", "10:30") from being taken for list items.
_NUMBERED_ITEM_RE = re.compile(r"\d{1,3}[.):]\D")


def _is_list_item(text: str) -> bool:
    """Return True if *text*, ignoring surrounding whitespace, opens with a list marker."""
    stripped = text.strip()
    if stripped.startswith(_BULLET_PREFIXES):
        return True
    return _NUMBERED_ITEM_RE.match(stripped) is not None
|
||
|
||
|
||
def _detect_tables(
    img_array: np.ndarray,
    elements: List[TextElement]
) -> List[Dict[str, Any]]:
    """
    Detect table regions in the image.

    Straight line segments are found with Canny + probabilistic Hough and
    split into near-horizontal and near-vertical ones. If at least two
    distinct rulings exist in each direction, a single region spanning them
    is reported. All rulings on the page are attributed to that one region.

    Args:
        img_array: Image as an array; 3-dimensional input is converted from
            RGB to grayscale, anything else is used as-is
        elements: Currently unused; kept for interface stability

    Returns:
        List with zero or one dict holding "x", "y", "width", "height",
        "rows" and "cols", all plain Python ints (JSON-serialisable).
    """
    tables: List[Dict[str, Any]] = []

    # Convert to grayscale
    if len(img_array.shape) == 3:
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    else:
        gray = img_array

    # Detect line segments: edge map first, then Hough transform
    edges = cv2.Canny(gray, 50, 150)
    lines = cv2.HoughLinesP(
        edges, 1, np.pi/180, threshold=100,
        minLineLength=50, maxLineGap=10
    )

    if lines is None:
        return tables

    # Separate horizontal and vertical lines
    horizontal_lines = []
    vertical_lines = []

    for line in lines:
        # Plain ints keep NumPy scalars out of the returned dict.
        x1, y1, x2, y2 = (int(v) for v in line[0])
        # Fold the direction into 0..90 degrees so the result does not
        # depend on the order in which the two endpoints are reported.
        angle = np.arctan2(abs(y2 - y1), abs(x2 - x1)) * 180 / np.pi

        if angle < 10:  # Horizontal
            horizontal_lines.append((x1, y1, x2, y2))
        elif angle > 80:  # Vertical
            vertical_lines.append((x1, y1, x2, y2))

    if not horizontal_lines or not vertical_lines:
        return tables

    # One drawn ruling usually yields several segments (an edge on each side
    # of the stroke, plus fragments), so count distinct positions instead of
    # raw segments.
    ruling_rows = _count_distinct_positions(
        (l[1] + l[3]) / 2 for l in horizontal_lines
    )
    ruling_cols = _count_distinct_positions(
        (l[0] + l[2]) / 2 for l in vertical_lines
    )

    # A table needs at least two rulings in each direction
    if ruling_rows >= 2 and ruling_cols >= 2:
        # Bounding box: x extent from vertical rulings, y from horizontal
        min_x = min(min(l[0], l[2]) for l in vertical_lines)
        max_x = max(max(l[0], l[2]) for l in vertical_lines)
        min_y = min(min(l[1], l[3]) for l in horizontal_lines)
        max_y = max(max(l[1], l[3]) for l in horizontal_lines)

        tables.append({
            "x": min_x,
            "y": min_y,
            "width": max_x - min_x,
            "height": max_y - min_y,
            "rows": ruling_rows - 1,
            "cols": ruling_cols - 1
        })

    return tables


def _count_distinct_positions(positions, tolerance: float = 10) -> int:
    """Count coordinate clusters; values within *tolerance* of a cluster's first value share it."""
    count = 0
    anchor = None
    for pos in sorted(positions):
        if anchor is None or pos - anchor > tolerance:
            count += 1
            anchor = pos
    return count
|
||
|
||
|
||
def _generate_fabric_json(
    elements: List[TextElement],
    page_width: int,
    page_height: int
) -> Dict[str, Any]:
    """
    Generate Fabric.js compatible JSON from elements.

    Args:
        elements: Elements to render, one textbox each
        page_width: Not used by the generated document
        page_height: Not used by the generated document

    Returns:
        Dict with "version", "objects" (textboxes in input order) and a
        white "background".
    """
    return {
        "version": "5.3.0",
        "objects": [
            _textbox_for(position, item)
            for position, item in enumerate(elements)
        ],
        "background": "#ffffff"
    }


def _textbox_for(position: int, item: "TextElement") -> Dict[str, Any]:
    """Build the Fabric.js textbox dict for one element at list index *position*."""
    weight = "bold" if item.is_bold else "normal"
    alignment = "center" if item.is_centered else "left"
    # Extra payload so the editor can map a textbox back to its source element.
    metadata = {
        "elementType": item.element_type.value,
        "confidence": item.confidence,
        "originalIndex": position
    }
    return {
        "type": "textbox",
        "version": "5.3.0",
        "originX": "left",
        "originY": "top",
        "left": item.x,
        "top": item.y,
        # Textboxes are never emitted narrower than 100 units.
        "width": max(item.width, 100),
        "height": item.height,
        "fill": "#000000",
        "stroke": None,
        "strokeWidth": 0,
        "text": item.text,
        "fontSize": item.font_size,
        "fontWeight": weight,
        "fontFamily": "Arial",
        "textAlign": alignment,
        "underline": False,
        "lineHeight": 1.2,
        "charSpacing": 0,
        "splitByGrapheme": False,
        "editable": True,
        "selectable": True,
        "data": metadata
    }
|
||
|
||
|
||
def layout_to_fabric_json(layout_result: LayoutResult) -> str:
    """
    Serialise the Fabric.js document of a LayoutResult for the frontend.

    Args:
        layout_result: Result whose fabric_json is serialised

    Returns:
        JSON text, indented by two spaces, with non-ASCII characters
        written as-is instead of being escaped.
    """
    document = layout_result.fabric_json
    return json.dumps(document, indent=2, ensure_ascii=False)
|
||
|
||
|
||
def reconstruct_and_clean(
    image_bytes: bytes,
    remove_handwriting: bool = True
) -> Tuple[bytes, LayoutResult]:
    """
    Full pipeline: optionally remove handwriting, then reconstruct layout.

    Args:
        image_bytes: Source image
        remove_handwriting: Whether to remove handwriting first

    Returns:
        Tuple of (image bytes the layout was computed from, layout result).
        The bytes are the input itself when handwriting removal is off.
    """
    cleaned_bytes = image_bytes

    if remove_handwriting:
        # Imported lazily so the inpainting service is only needed when used.
        from services.inpainting_service import remove_handwriting as clean_hw
        cleaned_bytes, _ = clean_hw(image_bytes)

    return cleaned_bytes, reconstruct_layout(cleaned_bytes)
|