feat(klausur): Handschrift entfernen + Klausur-HTR implementiert
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m49s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 15s
Feature 1: Handschrift entfernen via OCR-Pipeline Session
- services/handwriting_detection.py: _detect_pencil() + target_ink Parameter
("all" | "colored" | "pencil") für gezielte Tinten-Erkennung
- ocr_pipeline_session_store.py: clean_png + handwriting_removal_meta Spalten
(idempotentes ALTER TABLE in init_ocr_pipeline_tables)
- ocr_pipeline_api.py: POST /sessions/{id}/remove-handwriting Endpoint
+ "clean" zu valid_types für Image-Serving hinzugefügt
Feature 2: Klausur-HTR (Hochwertige Handschriftenerkennung)
- handwriting_htr_api.py: Neuer Router /api/v1/htr/recognize + /recognize-session
Primary: qwen2.5vl:32b via Ollama, Fallback: trocr-large-handwritten
- services/trocr_service.py: size Parameter (base | large) für get_trocr_model()
+ run_trocr_ocr() - unterstützt jetzt trocr-large-handwritten
- main.py: HTR Router registriert
Config:
- docker-compose.yml: OLLAMA_HTR_MODEL, HTR_FALLBACK_MODEL
- .env.example: HTR Env-Vars dokumentiert
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,6 +6,7 @@ Uses multiple detection methods:
1. Color-based detection (blue/red ink)
2. Stroke analysis (thin irregular strokes)
3. Edge density variance
4. Pencil detection (gray ink)

DATENSCHUTZ: All processing happens locally on Mac Mini.
"""
@@ -37,12 +38,16 @@ class DetectionResult:
detection_method: str # Which method was primarily used


def detect_handwriting(image_bytes: bytes) -> DetectionResult:
def detect_handwriting(image_bytes: bytes, target_ink: str = "all") -> DetectionResult:
"""
Detect handwriting in an image.

Args:
image_bytes: Image as bytes (PNG, JPG, etc.)
target_ink: Which ink types to detect:
- "all" → all methods combined (incl. pencil)
- "colored" → only color-based (blue/red/green pen)
- "pencil" → only pencil (gray ink)

Returns:
DetectionResult with binary mask where handwriting is white (255)
@@ -62,35 +67,51 @@ def detect_handwriting(image_bytes: bytes) -> DetectionResult:

# Convert to BGR if needed (OpenCV format)
if len(img_array.shape) == 2:
# Grayscale to BGR
img_bgr = cv2.cvtColor(img_array, cv2.COLOR_GRAY2BGR)
elif img_array.shape[2] == 4:
# RGBA to BGR
img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGBA2BGR)
elif img_array.shape[2] == 3:
# RGB to BGR
img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
else:
img_bgr = img_array

# Run multiple detection methods
color_mask, color_confidence = _detect_by_color(img_bgr)
stroke_mask, stroke_confidence = _detect_by_stroke_analysis(img_bgr)
variance_mask, variance_confidence = _detect_by_variance(img_bgr)
# Select detection methods based on target_ink
masks_and_weights = []

if target_ink in ("all", "colored"):
color_mask, color_conf = _detect_by_color(img_bgr)
masks_and_weights.append((color_mask, color_conf, "color"))

if target_ink == "all":
stroke_mask, stroke_conf = _detect_by_stroke_analysis(img_bgr)
variance_mask, variance_conf = _detect_by_variance(img_bgr)
masks_and_weights.append((stroke_mask, stroke_conf, "stroke"))
masks_and_weights.append((variance_mask, variance_conf, "variance"))

if target_ink in ("all", "pencil"):
pencil_mask, pencil_conf = _detect_pencil(img_bgr)
masks_and_weights.append((pencil_mask, pencil_conf, "pencil"))

if not masks_and_weights:
# Fallback: use all methods
color_mask, color_conf = _detect_by_color(img_bgr)
stroke_mask, stroke_conf = _detect_by_stroke_analysis(img_bgr)
variance_mask, variance_conf = _detect_by_variance(img_bgr)
pencil_mask, pencil_conf = _detect_pencil(img_bgr)
masks_and_weights = [
(color_mask, color_conf, "color"),
(stroke_mask, stroke_conf, "stroke"),
(variance_mask, variance_conf, "variance"),
(pencil_mask, pencil_conf, "pencil"),
]

# Combine masks using weighted average
weights = [color_confidence, stroke_confidence, variance_confidence]
total_weight = sum(weights)
total_weight = sum(w for _, w, _ in masks_and_weights)

if total_weight > 0:
# Weighted combination
combined_mask = (
color_mask.astype(np.float32) * color_confidence +
stroke_mask.astype(np.float32) * stroke_confidence +
variance_mask.astype(np.float32) * variance_confidence
combined_mask = sum(
m.astype(np.float32) * w for m, w, _ in masks_and_weights
) / total_weight

# Threshold to binary
combined_mask = (combined_mask > 127).astype(np.uint8) * 255
else:
combined_mask = np.zeros(img_bgr.shape[:2], dtype=np.uint8)
@@ -103,19 +124,11 @@ def detect_handwriting(image_bytes: bytes) -> DetectionResult:
handwriting_pixels = np.sum(combined_mask > 0)
handwriting_ratio = handwriting_pixels / total_pixels if total_pixels > 0 else 0

# Determine primary method
primary_method = "combined"
max_conf = max(color_confidence, stroke_confidence, variance_confidence)
if max_conf == color_confidence:
primary_method = "color"
elif max_conf == stroke_confidence:
primary_method = "stroke"
else:
primary_method = "variance"
# Determine primary method (highest confidence)
primary_method = max(masks_and_weights, key=lambda x: x[1])[2] if masks_and_weights else "combined"
overall_confidence = total_weight / len(masks_and_weights) if masks_and_weights else 0.0

overall_confidence = total_weight / 3.0 # Average confidence

logger.info(f"Handwriting detection: {handwriting_ratio:.2%} handwriting, "
logger.info(f"Handwriting detection (target_ink={target_ink}): {handwriting_ratio:.2%} handwriting, "
f"confidence={overall_confidence:.2f}, method={primary_method}")

return DetectionResult(
@@ -180,6 +193,27 @@ def _detect_by_color(img_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
return color_mask, confidence


def _detect_pencil(img_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
"""
Detect pencil marks (gray ink, ~140-220 on 255-scale).

Paper is usually >230, dark ink <130.
Pencil falls in the 140-220 gray range.
"""
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
pencil_mask = cv2.inRange(gray, 140, 220)

# Remove small noise artifacts
kernel = np.ones((2, 2), np.uint8)
pencil_mask = cv2.morphologyEx(pencil_mask, cv2.MORPH_OPEN, kernel, iterations=1)

ratio = np.sum(pencil_mask > 0) / pencil_mask.size
# Good confidence if pencil pixels are in a plausible range
confidence = 0.75 if 0.002 < ratio < 0.2 else 0.2

return pencil_mask, confidence


def _detect_by_stroke_analysis(img_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
"""
Detect handwriting by analyzing stroke characteristics.
@@ -31,8 +31,10 @@ from datetime import datetime, timedelta
logger = logging.getLogger(__name__)

# Lazy loading for heavy dependencies
_trocr_processor = None
_trocr_model = None
# Cache keyed by model_name to support base and large variants simultaneously
_trocr_models: dict = {} # {model_name: (processor, model)}
_trocr_processor = None # backwards-compat alias → base-printed
_trocr_model = None # backwards-compat alias → base-printed
_trocr_available = None
_model_loaded_at = None
@@ -124,12 +126,14 @@ def _check_trocr_available() -> bool:
return _trocr_available


def get_trocr_model(handwritten: bool = False):
def get_trocr_model(handwritten: bool = False, size: str = "base"):
"""
Lazy load TrOCR model and processor.

Args:
handwritten: Use handwritten model instead of printed model
size: Model size — "base" (300 MB) or "large" (340 MB, higher accuracy
for exam HTR). Only applies to handwritten variant.

Returns tuple of (processor, model) or (None, None) if unavailable.
"""
@@ -138,31 +142,42 @@ def get_trocr_model(handwritten: bool = False):
if not _check_trocr_available():
return None, None

if _trocr_processor is None or _trocr_model is None:
try:
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# Select model name
if size == "large" and handwritten:
model_name = "microsoft/trocr-large-handwritten"
elif handwritten:
model_name = "microsoft/trocr-base-handwritten"
else:
model_name = "microsoft/trocr-base-printed"

# Choose model based on use case
if handwritten:
model_name = "microsoft/trocr-base-handwritten"
else:
model_name = "microsoft/trocr-base-printed"
if model_name in _trocr_models:
return _trocr_models[model_name]

logger.info(f"Loading TrOCR model: {model_name}")
_trocr_processor = TrOCRProcessor.from_pretrained(model_name)
_trocr_model = VisionEncoderDecoderModel.from_pretrained(model_name)
try:
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
_trocr_model.to(device)
logger.info(f"TrOCR model loaded on device: {device}")
logger.info(f"Loading TrOCR model: {model_name}")
processor = TrOCRProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

except Exception as e:
logger.error(f"Failed to load TrOCR model: {e}")
return None, None
# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
model.to(device)
logger.info(f"TrOCR model loaded on device: {device}")

return _trocr_processor, _trocr_model
_trocr_models[model_name] = (processor, model)

# Keep backwards-compat globals pointing at base-printed
if model_name == "microsoft/trocr-base-printed":
_trocr_processor = processor
_trocr_model = model

return processor, model

except Exception as e:
logger.error(f"Failed to load TrOCR model {model_name}: {e}")
return None, None


def preload_trocr_model(handwritten: bool = True) -> bool:
@@ -209,7 +224,8 @@ def get_model_status() -> Dict[str, Any]:
async def run_trocr_ocr(
image_data: bytes,
handwritten: bool = False,
split_lines: bool = True
split_lines: bool = True,
size: str = "base",
) -> Tuple[Optional[str], float]:
"""
Run TrOCR on an image.

@@ -223,11 +239,12 @@ async def run_trocr_ocr(
image_data: Raw image bytes
handwritten: Use handwritten model (slower but better for handwriting)
split_lines: Whether to split image into lines first
size: "base" or "large" (only for handwritten variant)

Returns:
Tuple of (extracted_text, confidence)
"""
processor, model = get_trocr_model(handwritten=handwritten)
processor, model = get_trocr_model(handwritten=handwritten, size=size)

if processor is None or model is None:
logger.error("TrOCR model not available")
Reference in New Issue
Block a user