This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/backend/image_cleaner.py
BreakPilot Dev 19855efacc
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
feat: BreakPilot PWA - Full codebase (clean push without large binaries)
All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
2026-02-11 13:25:58 +01:00

346 lines
12 KiB
Python

"""
Image Cleaning Module - Stage 2 of Worksheet Cleaning System
Removes handwriting and markings from worksheet scans while preserving
printed text and diagrams using computer vision techniques.
"""
import cv2
import numpy as np
from pathlib import Path
from typing import Dict, List, Optional
import logging
logger = logging.getLogger(__name__)
class WorksheetCleaner:
    """
    Removes handwriting from worksheet scans while preserving printed content.

    Multi-strategy approach:
    1. Color-based filtering (blue ink detection)
    2. AI-guided region masking (using bounding boxes from analysis)
    3. Stroke thickness analysis (thin handwriting vs thick print)
    4. Diagram preservation (copy from original)
    """

    # Region ``type`` values whose bounding boxes get masked and inpainted.
    _MASKED_REGION_TYPES = ('student_answer', 'correction', 'note', 'drawing')

    def __init__(self, debug_mode: bool = False):
        """
        Initialize the worksheet cleaner.

        Args:
            debug_mode: Intended to enable saving of intermediate images for
                debugging. NOTE(review): the flag is only stored; no method
                of this class reads it.
        """
        self.debug_mode = debug_mode

        # Tunable parameters (tuned for better handwriting removal).
        self.blue_hue_range = (90, 130)  # HSV hue range for blue ink
        self.inpaint_radius = 10  # raised from 3 to 10 for better inpainting
        # NOTE(review): not read by any method of this class.
        self.min_stroke_thickness = 2
        self.handwriting_area_threshold = 50
        self.sharpen_amount = 1.5
        # Grows masks so that handwriting is covered completely.
        self.mask_dilation_kernel_size = 5
        # Pixels added on every side of a handwriting bounding box.
        self.region_padding = 10

    def clean_worksheet(
        self,
        image_path: Path,
        analysis_data: Dict,
        output_path: Path
    ) -> Path:
        """
        Main cleaning pipeline.

        Args:
            image_path: Path to input worksheet scan
            analysis_data: JSON from Stage 1 analysis (with
                layout/handwriting_regions)
            output_path: Where to save cleaned image

        Returns:
            Path to cleaned image

        Raises:
            ValueError: If image cannot be loaded
            RuntimeError: If any cleaning step fails or the result cannot
                be written to ``output_path``
        """
        logger.info(f"Starting cleaning for {image_path.name}")

        # cv2.imread signals failure by returning None instead of raising.
        img = cv2.imread(str(image_path))
        if img is None:
            raise ValueError(f"Cannot load image: {image_path}")

        original = img.copy()
        cleaned = img.copy()

        try:
            # ``or []`` / ``or {}`` also cover keys that are present but null.
            hw_regions = analysis_data.get('handwriting_regions') or []

            # Strategy 1: Color-based filtering
            if self._has_blue_ink_annotations(analysis_data):
                logger.info("Applying blue ink removal")
                cleaned = self._remove_blue_ink(cleaned)

            # Strategy 2: AI-guided region masking
            if hw_regions:
                logger.info(f"Masking {len(hw_regions)} handwriting regions")
                cleaned = self._mask_handwriting_regions(cleaned, hw_regions)

            # Strategy 3: Stroke thickness analysis
            logger.info("Removing thin strokes")
            cleaned = self._remove_thin_strokes(cleaned, img)

            # Post-processing: enhance printed text
            logger.info("Enhancing printed text")
            cleaned = self._enhance_printed_text(cleaned)

            # Preserve diagrams (done last so they stay pixel-identical to
            # the original scan).
            layout = analysis_data.get('layout') or {}
            diagram_elements = layout.get('diagram_elements') or []
            if diagram_elements:
                logger.info(f"Preserving {len(diagram_elements)} diagram elements")
                cleaned = self._preserve_diagrams(cleaned, original, diagram_elements)

            # cv2.imwrite reports failure through its boolean return value;
            # ignoring it would hand back a path to a file that was never
            # written.
            if not cv2.imwrite(str(output_path), cleaned):
                raise OSError(f"Could not write image to {output_path}")
            logger.info(f"Cleaned image saved to {output_path.name}")
            return output_path
        except Exception as e:
            logger.error(f"Cleaning failed for {image_path.name}: {e}")
            raise RuntimeError(f"Cleaning failed: {e}") from e

    def _has_blue_ink_annotations(self, analysis_data: Dict) -> bool:
        """Return True if any handwriting region has ``color_hint == 'blue'``."""
        hw_regions = analysis_data.get('handwriting_regions') or []
        return any(
            isinstance(r, dict) and r.get('color_hint') == 'blue'
            for r in hw_regions
        )

    @staticmethod
    def _clip_bbox(bbox, img_w: int, img_h: int, padding: int = 0) -> Optional[tuple]:
        """
        Validate a bounding-box dict and clip it to the image.

        Args:
            bbox: Dict with ``x``, ``y``, ``width``, ``height`` (missing keys
                count as 0). Values are rounded to integers so float
                coordinates coming from JSON are accepted.
            img_w: Image width in pixels
            img_h: Image height in pixels
            padding: Pixels to grow the box by on every side before clipping

        Returns:
            ``(x, y, w, h)`` as ints, lying fully inside the image, or None
            when the box is malformed, has a non-positive size, has a
            negative origin, or starts outside the image.
        """
        if not isinstance(bbox, dict):
            return None
        try:
            x = int(round(bbox.get('x', 0)))
            y = int(round(bbox.get('y', 0)))
            w = int(round(bbox.get('width', 0)))
            h = int(round(bbox.get('height', 0)))
        except (TypeError, ValueError, OverflowError):
            return None

        if w <= 0 or h <= 0 or x < 0 or y < 0:
            return None
        # A box that starts beyond the right/bottom edge has no overlap with
        # the image; clamping it onto the last row/column would touch pixels
        # it never referred to.
        if x >= img_w or y >= img_h:
            return None

        # Clip both edges independently so clipped padding on one side does
        # not leak onto the opposite side.
        x0 = max(0, x - padding)
        y0 = max(0, y - padding)
        x1 = min(img_w, x + w + padding)
        y1 = min(img_h, y + h + padding)
        return x0, y0, x1 - x0, y1 - y0

    def _remove_blue_ink(self, img: np.ndarray) -> np.ndarray:
        """
        Remove blue pen marks (common for student answers).

        Strategy: convert to HSV, isolate the configured blue hue range,
        clean the mask morphologically, then inpaint the masked pixels.

        Args:
            img: Input image (BGR)

        Returns:
            Image with blue ink removed
        """
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

        # Default blue hue range is 90-130 in OpenCV HSV (hue is 0-180);
        # saturation/value must be at least 50.
        lower_blue = np.array([self.blue_hue_range[0], 50, 50])
        upper_blue = np.array([self.blue_hue_range[1], 255, 255])
        blue_mask = cv2.inRange(hsv, lower_blue, upper_blue)

        # Close small gaps, then open to drop isolated specks.
        kernel = np.ones((3, 3), np.uint8)
        blue_mask = cv2.morphologyEx(blue_mask, cv2.MORPH_CLOSE, kernel)
        blue_mask = cv2.morphologyEx(blue_mask, cv2.MORPH_OPEN, kernel)

        # Fill the masked pixels from their surroundings.
        cleaned = cv2.inpaint(img, blue_mask, inpaintRadius=self.inpaint_radius,
                              flags=cv2.INPAINT_TELEA)
        return cleaned

    def _mask_handwriting_regions(
        self,
        img: np.ndarray,
        hw_regions: List[Dict]
    ) -> np.ndarray:
        """
        Mask out handwriting regions identified by AI.

        Builds one global mask from the padded bounding boxes of all regions
        whose ``type`` is a student answer, correction, note or drawing,
        dilates it, and inpaints the image under the mask.

        Args:
            img: Input image
            hw_regions: List of handwriting region dicts with bounding_box

        Returns:
            Image with handwriting regions cleaned (a copy; ``img`` is not
            modified)
        """
        cleaned = img.copy()
        img_h, img_w = img.shape[:2]

        # Global mask covering every handwriting region.
        mask = np.zeros((img_h, img_w), dtype=np.uint8)
        for region in hw_regions or []:
            if not isinstance(region, dict):
                continue
            if region.get('type') not in self._MASKED_REGION_TYPES:
                continue
            box = self._clip_bbox(
                region.get('bounding_box'), img_w, img_h,
                padding=self.region_padding,
            )
            if box is None:
                continue
            x, y, w, h = box
            mask[y:y + h, x:x + w] = 255

        # Grow the mask further so stroke ends outside the box are covered.
        if self.mask_dilation_kernel_size > 0:
            kernel = np.ones(
                (self.mask_dilation_kernel_size, self.mask_dilation_kernel_size),
                np.uint8,
            )
            mask = cv2.dilate(mask, kernel, iterations=2)

        # Inpaint only when at least one pixel is masked.
        if np.any(mask > 0):
            cleaned = cv2.inpaint(cleaned, mask, inpaintRadius=self.inpaint_radius,
                                  flags=cv2.INPAINT_TELEA)
            logger.info(f"Inpainted {np.sum(mask > 0)} pixels with radius {self.inpaint_radius}")
        return cleaned

    def _clean_text_region(self, region: np.ndarray) -> np.ndarray:
        """
        Clean a specific text region, removing handwriting but keeping print.

        Heuristic: connected components that are small in area or in extent
        are treated as handwriting and inpainted. The thresholds need tuning
        on real samples.

        Args:
            region: Image region to clean (BGR)

        Returns:
            Cleaned region
        """
        gray = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY)

        # Adaptive threshold to separate foreground (ink becomes 255).
        binary = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV, 11, 2
        )

        _, labels, stats, _ = cv2.connectedComponentsWithStats(
            binary, connectivity=8
        )

        # Decide per label whether to remove it, then map that decision onto
        # the label image in one step instead of one full-image comparison
        # per component.
        areas = stats[:, cv2.CC_STAT_AREA]
        extents = np.maximum(stats[:, cv2.CC_STAT_WIDTH],
                             stats[:, cv2.CC_STAT_HEIGHT])
        remove = (areas < self.handwriting_area_threshold) | (extents < 10)
        remove[0] = False  # label 0 is the background
        mask = np.where(remove[labels], 255, 0).astype(np.uint8)

        cleaned = cv2.inpaint(region, mask, inpaintRadius=self.inpaint_radius,
                              flags=cv2.INPAINT_TELEA)
        return cleaned

    def _remove_thin_strokes(self, cleaned: np.ndarray, original: np.ndarray) -> np.ndarray:
        """
        Remove thin pen strokes (handwriting) while keeping thicker printed text.

        Uses Canny edges plus morphological operations to build a mask of
        "thin" strokes and inpaints under it.

        Args:
            cleaned: Current cleaned image
            original: Original image (accepted for interface compatibility;
                not used by the current implementation)

        Returns:
            Image with thin strokes removed
        """
        gray_clean = cv2.cvtColor(cleaned, cv2.COLOR_BGR2GRAY)
        edges_clean = cv2.Canny(gray_clean, 50, 150)

        # Closing connects nearby edges.
        kernel = np.ones((2, 2), np.uint8)
        thick_strokes = cv2.morphologyEx(edges_clean, cv2.MORPH_CLOSE, kernel, iterations=2)

        # NOTE(review): Canny edges are normally one pixel wide, so eroding
        # them with a 2x2 kernel and then subtracting the closed edge map
        # probably leaves a (near-)empty mask. Verify on real scans that this
        # stage actually removes anything.
        thin_strokes = cv2.erode(edges_clean, kernel, iterations=1)
        thin_only = cv2.subtract(thin_strokes, thick_strokes)

        result = cv2.inpaint(cleaned, thin_only, inpaintRadius=2, flags=cv2.INPAINT_TELEA)
        return result

    def _enhance_printed_text(self, img: np.ndarray) -> np.ndarray:
        """
        Sharpen and enhance printed text after cleaning.

        Uses unsharp masking: ``amount * img + (1 - amount) * blurred``.

        Args:
            img: Input image

        Returns:
            Enhanced image
        """
        gaussian = cv2.GaussianBlur(img, (0, 0), 2.0)
        # Keep the two weights summing to 1 so overall brightness is
        # unchanged for any sharpen_amount (-0.5 for the default 1.5).
        blur_weight = 1.0 - self.sharpen_amount
        sharpened = cv2.addWeighted(img, self.sharpen_amount, gaussian, blur_weight, 0)
        return sharpened

    def _preserve_diagrams(
        self,
        cleaned: np.ndarray,
        original: np.ndarray,
        diagram_elements: List[Dict]
    ) -> np.ndarray:
        """
        Preserve diagram/illustration areas by copying from original.

        Args:
            cleaned: Current cleaned image
            original: Original scan
            diagram_elements: List of diagram bounding boxes from AI analysis;
                an element is skipped when its ``preserve`` key is falsy
                (default True)

        Returns:
            Copy of ``cleaned`` with the diagram areas replaced by the
            corresponding pixels of ``original``
        """
        result = cleaned.copy()
        img_h, img_w = original.shape[:2]

        for diagram in diagram_elements or []:
            if not isinstance(diagram, dict) or not diagram.get('preserve', True):
                continue
            box = self._clip_bbox(diagram.get('bounding_box'), img_w, img_h)
            if box is None:
                continue
            x, y, w, h = box
            result[y:y + h, x:x + w] = original[y:y + h, x:x + w]
        return result