Files
breakpilot-lehrer/klausur-service/backend/services/layout_reconstruction_service.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

376 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Layout Reconstruction Service for Worksheet Cleanup
Reconstructs the layout of a worksheet from an image:
1. Uses PaddleOCR to detect text with bounding boxes
2. Groups text into logical elements (headings, paragraphs, tables)
3. Generates Fabric.js compatible JSON for the worksheet editor
DATENSCHUTZ: All processing happens locally on Mac Mini.
"""
import numpy as np
from PIL import Image
import io
import json
import logging
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass, field
from enum import Enum
# OpenCV is optional - only required for actual layout reconstruction
# (reconstruct_layout raises ImportError if it is missing, so merely
# importing this module never fails on a machine without OpenCV).
try:
    import cv2
    CV2_AVAILABLE = True
except ImportError:
    cv2 = None  # sentinel: table detection is disabled without OpenCV
    CV2_AVAILABLE = False

logger = logging.getLogger(__name__)
class ElementType(str, Enum):
    """Types of detected layout elements.

    Inherits from ``str`` so the values serialize directly into the
    Fabric.js JSON (see the ``data.elementType`` field emitted by
    ``_generate_fabric_json``).
    """
    HEADING = "heading"        # larger / centered title text
    PARAGRAPH = "paragraph"
    TEXT_LINE = "text_line"    # default for a single OCR line
    TABLE = "table"
    LIST_ITEM = "list_item"    # bulleted or numbered line
    FORM_FIELD = "form_field"  # fill-in blank (underscores / dot runs)
    IMAGE = "image"
@dataclass
class TextElement:
    """A detected text element with position.

    Coordinates are in image pixels with the origin at the top-left,
    matching the OCR bounding boxes and Fabric.js conventions.
    """
    text: str
    x: float  # Left position (pixels)
    y: float  # Top position (pixels)
    width: float
    height: float
    confidence: float  # OCR confidence as reported by PaddleOCR
    element_type: ElementType = ElementType.TEXT_LINE
    font_size: float = 14.0  # estimated from box height in _convert_ocr_to_elements
    is_bold: bool = False      # set for headings by _classify_elements
    is_centered: bool = False  # set for headings by _classify_elements
@dataclass
class LayoutResult:
    """Result of layout reconstruction.

    ``fabric_json`` is a Fabric.js v5.3.0 canvas document ready for the
    worksheet editor; ``table_regions`` holds raw detected table bounds
    (x, y, width, height, rows, cols) that are not part of the canvas.
    """
    elements: List[TextElement]
    page_width: int   # source image width in pixels
    page_height: int  # source image height in pixels
    fabric_json: Dict[str, Any]
    table_regions: List[Dict[str, Any]] = field(default_factory=list)
def reconstruct_layout(
    image_bytes: bytes,
    detect_tables: bool = True
) -> LayoutResult:
    """
    Reconstruct the layout of a worksheet from an image.

    Pipeline: PaddleOCR for positioned text, heuristic classification
    (headings, list items, form fields), optional table detection on the
    raw pixels, then conversion to a Fabric.js canvas document.

    Args:
        image_bytes: Image as bytes.
        detect_tables: Whether to detect table structures.

    Returns:
        LayoutResult with elements and Fabric.js JSON.

    Raises:
        ImportError: If OpenCV is not available.
    """
    if not CV2_AVAILABLE:
        raise ImportError(
            "OpenCV (cv2) is required for layout reconstruction. "
            "Install with: pip install opencv-python-headless"
        )

    # Decode once to learn the page dimensions (and reuse for tables).
    pixels = np.array(Image.open(io.BytesIO(image_bytes)))
    page_height, page_width = pixels.shape[:2]

    # OCR pass: text with bounding boxes.
    raw_results = _run_paddle_ocr(image_bytes)
    if not raw_results:
        logger.warning("No text detected by PaddleOCR")
        return LayoutResult(
            elements=[],
            page_width=page_width,
            page_height=page_height,
            fabric_json={"version": "5.3.0", "objects": []}
        )

    # Typed elements, then heading/list/form-field classification.
    elements = _convert_ocr_to_elements(raw_results, page_width, page_height)
    elements = _classify_elements(elements, page_width)

    # Optional table detection on the decoded image.
    table_regions = _detect_tables(pixels, elements) if detect_tables else []

    fabric_json = _generate_fabric_json(elements, page_width, page_height)

    logger.info(f"Layout reconstruction: {len(elements)} elements, "
                f"{len(table_regions)} tables")

    return LayoutResult(
        elements=elements,
        page_width=page_width,
        page_height=page_height,
        fabric_json=fabric_json,
        table_regions=table_regions
    )
def _run_paddle_ocr(image_bytes: bytes) -> List[Dict[str, Any]]:
    """
    Run PaddleOCR on an image.

    Args:
        image_bytes: Raw image bytes to OCR.

    Returns:
        List of ``{text, confidence, bbox}`` dicts, where ``bbox`` is
        ``[x1, y1, x2, y2]`` in pixel coordinates. Best-effort: returns
        an empty list if PaddleOCR is unavailable or fails, so callers
        degrade to "no text detected" instead of crashing.
    """
    try:
        # Lazy import so this module loads even without the OCR stack.
        # (Fix: OCRRegion was previously imported here but never used.)
        from hybrid_vocab_extractor import run_paddle_ocr as paddle_ocr_func
        regions, _ = paddle_ocr_func(image_bytes)
        return [
            {
                "text": r.text,
                "confidence": r.confidence,
                "bbox": [r.x1, r.y1, r.x2, r.y2]
            }
            for r in regions
        ]
    except ImportError:
        logger.error("PaddleOCR not available")
        return []
    except Exception as e:
        # Deliberate broad catch: OCR failure must not break the pipeline.
        logger.error(f"PaddleOCR failed: {e}")
        return []
def _convert_ocr_to_elements(
    ocr_results: List[Dict[str, Any]],
    page_width: int,
    page_height: int
) -> List[TextElement]:
    """
    Convert raw OCR result dicts into TextElement instances.

    The font size is estimated from the bounding-box height (scaled by
    0.8) and clamped to the range [8, 72]; page dimensions are accepted
    for interface symmetry with the other pipeline stages.
    """
    converted: List[TextElement] = []
    for entry in ocr_results:
        left, top, right, bottom = entry["bbox"]
        box_width = right - left
        box_height = bottom - top

        # Height-based font-size estimate, clamped to sane bounds.
        estimated_size = max(8, min(72, box_height * 0.8))

        converted.append(TextElement(
            text=entry["text"],
            x=left,
            y=top,
            width=box_width,
            height=box_height,
            confidence=entry["confidence"],
            font_size=estimated_size
        ))
    return converted
def _classify_elements(
    elements: List[TextElement],
    page_width: int
) -> List[TextElement]:
    """
    Classify elements in place as headings, list items, form fields or
    plain text lines, using page-level averages as baselines.

    Args:
        elements: Elements to classify (mutated in place).
        page_width: Page width in pixels, used for the centering test.

    Returns:
        The same list, with ``element_type`` (and heading styling) set.
    """
    if not elements:
        return elements

    # Page-level averages used as classification baselines.
    avg_font_size = sum(e.font_size for e in elements) / len(elements)
    avg_y = sum(e.y for e in elements) / len(elements)

    # BUG FIX: the prefix tuple previously contained empty strings
    # ('', '-', '', '*') — str.startswith('') is always True, so every
    # non-heading element was misclassified as LIST_ITEM. The empty
    # entries were presumably bullet glyphs lost in transit; restored
    # here as explicit escapes (• ◦ ‣ - *).
    bullet_prefixes = ('\u2022', '\u25e6', '\u2023', '-', '*')

    for element in elements:
        # Headings: noticeably larger font AND near the top or centered.
        is_larger = element.font_size > avg_font_size * 1.3
        is_near_top = element.y < avg_y * 0.3
        is_centered = abs((element.x + element.width / 2) - page_width / 2) < page_width * 0.15

        # Numbered list: digit followed by '.', ')' or ':' (e.g. "1.").
        is_numbered = (
            len(element.text) > 2
            and element.text[0].isdigit()
            and element.text[1] in '.):'
        )

        if is_larger and (is_near_top or is_centered):
            element.element_type = ElementType.HEADING
            element.is_bold = True
            element.is_centered = is_centered
        elif element.text.strip().startswith(bullet_prefixes) or is_numbered:
            element.element_type = ElementType.LIST_ITEM
        elif '_____' in element.text or '.....' in element.text:
            # Fill-in blanks: long underscore or dot runs.
            element.element_type = ElementType.FORM_FIELD
        else:
            element.element_type = ElementType.TEXT_LINE
    return elements
def _detect_tables(
    img_array: np.ndarray,
    elements: List[TextElement]
) -> List[Dict[str, Any]]:
    """
    Detect table regions in the image via line detection.

    Uses Canny edges plus a probabilistic Hough transform, buckets the
    segments into near-horizontal and near-vertical, and — when at least
    a 2x2 grid of lines exists — reports one bounding region with an
    estimated row/column count. ``elements`` is accepted for interface
    symmetry with the other pipeline stages.
    """
    # The edge detector wants a single-channel image.
    if img_array.ndim == 3:
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    else:
        gray = img_array

    edges = cv2.Canny(gray, 50, 150)
    segments = cv2.HoughLinesP(
        edges, 1, np.pi / 180, threshold=100,
        minLineLength=50, maxLineGap=10
    )
    if segments is None:
        return []

    # Bucket segments by orientation (within 10 degrees of an axis).
    horizontals = []
    verticals = []
    for segment in segments:
        sx1, sy1, sx2, sy2 = segment[0]
        angle = np.abs(np.arctan2(sy2 - sy1, sx2 - sx1) * 180 / np.pi)
        if angle < 10:
            horizontals.append((sx1, sy1, sx2, sy2))
        elif angle > 80:
            verticals.append((sx1, sy1, sx2, sy2))

    # A table needs at least two lines in each direction (a 2x2 grid).
    if len(horizontals) < 2 or len(verticals) < 2:
        return []

    horizontals.sort(key=lambda seg: seg[1])
    verticals.sort(key=lambda seg: seg[0])

    # Bounding box spanned by the detected grid lines.
    min_x = min(seg[0] for seg in verticals)
    max_x = max(seg[2] for seg in verticals)
    min_y = min(seg[1] for seg in horizontals)
    max_y = max(seg[3] for seg in horizontals)

    return [{
        "x": min_x,
        "y": min_y,
        "width": max_x - min_x,
        "height": max_y - min_y,
        "rows": len(horizontals) - 1,
        "cols": len(verticals) - 1
    }]
def _generate_fabric_json(
elements: List[TextElement],
page_width: int,
page_height: int
) -> Dict[str, Any]:
"""
Generate Fabric.js compatible JSON from elements.
"""
fabric_objects = []
for i, element in enumerate(elements):
fabric_obj = {
"type": "textbox",
"version": "5.3.0",
"originX": "left",
"originY": "top",
"left": element.x,
"top": element.y,
"width": max(element.width, 100),
"height": element.height,
"fill": "#000000",
"stroke": None,
"strokeWidth": 0,
"text": element.text,
"fontSize": element.font_size,
"fontWeight": "bold" if element.is_bold else "normal",
"fontFamily": "Arial",
"textAlign": "center" if element.is_centered else "left",
"underline": False,
"lineHeight": 1.2,
"charSpacing": 0,
"splitByGrapheme": False,
"editable": True,
"selectable": True,
"data": {
"elementType": element.element_type.value,
"confidence": element.confidence,
"originalIndex": i
}
}
fabric_objects.append(fabric_obj)
return {
"version": "5.3.0",
"objects": fabric_objects,
"background": "#ffffff"
}
def layout_to_fabric_json(layout_result: LayoutResult) -> str:
"""
Convert LayoutResult to JSON string for frontend.
"""
return json.dumps(layout_result.fabric_json, ensure_ascii=False, indent=2)
def reconstruct_and_clean(
    image_bytes: bytes,
    remove_handwriting: bool = True
) -> Tuple[bytes, LayoutResult]:
    """
    Full pipeline: clean handwriting and reconstruct layout.

    Args:
        image_bytes: Source image.
        remove_handwriting: Whether to remove handwriting first.

    Returns:
        Tuple of (cleaned image bytes, layout result).
    """
    cleaned_bytes = image_bytes
    if remove_handwriting:
        # Imported lazily so the inpainting stack only loads when used.
        from services.inpainting_service import remove_handwriting as clean_hw
        cleaned_bytes, _ = clean_hw(image_bytes)

    return cleaned_bytes, reconstruct_layout(cleaned_bytes)