[split-required] Split final 43 files (500-668 LOC) to complete refactoring

klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 09:41:42 +02:00
parent 451365a312
commit bd4b956e3c
113 changed files with 13790 additions and 14148 deletions

View File

@@ -15,60 +15,24 @@ Verwendet:
"""
import logging
import os
import io
import base64
from pathlib import Path
from typing import Optional, List, Dict, Any, Tuple, Union
from dataclasses import dataclass
from enum import Enum
from typing import Optional, List, Dict, Any
import cv2
import numpy as np
from PIL import Image
from .file_processor_models import (
FileType,
ProcessingMode,
ProcessedRegion,
ProcessingResult,
)
logger = logging.getLogger(__name__)
class FileType(str, Enum):
"""Unterstützte Dateitypen."""
PDF = "pdf"
IMAGE = "image"
DOCX = "docx"
DOC = "doc"
TXT = "txt"
UNKNOWN = "unknown"
class ProcessingMode(str, Enum):
"""Verarbeitungsmodi."""
OCR_HANDWRITING = "ocr_handwriting" # Handschrifterkennung
OCR_PRINTED = "ocr_printed" # Gedruckter Text
TEXT_EXTRACT = "text_extract" # Textextraktion (PDF/DOCX)
MIXED = "mixed" # Kombiniert OCR + Textextraktion
@dataclass
class ProcessedRegion:
"""Ein erkannter Textbereich."""
text: str
confidence: float
bbox: Tuple[int, int, int, int] # x1, y1, x2, y2
page: int = 1
@dataclass
class ProcessingResult:
"""Ergebnis der Dokumentenverarbeitung."""
text: str
confidence: float
regions: List[ProcessedRegion]
page_count: int
file_type: FileType
processing_mode: ProcessingMode
metadata: Dict[str, Any]
class FileProcessor:
"""
Zentrale Dokumentenverarbeitung für BreakPilot.
@@ -81,17 +45,9 @@ class FileProcessor:
"""
def __init__(self, ocr_lang: str = "de", use_gpu: bool = False):
    """Initialize the file processor.

    Args:
        ocr_lang: OCR language code (default: "de" for German).
        use_gpu: Whether the OCR engine should run on the GPU
            (speeds up processing).
    """
    self.ocr_lang = ocr_lang
    self.use_gpu = use_gpu
    # Backing field for the OCR engine; built on demand by the
    # ocr_engine property rather than eagerly here.
    self._ocr_engine = None
    logger.info(f"FileProcessor initialized (lang={ocr_lang}, gpu={use_gpu})")
@property
@@ -107,7 +63,7 @@ class FileProcessor:
from paddleocr import PaddleOCR
return PaddleOCR(
use_angle_cls=True,
lang='german', # Deutsch
lang='german',
use_gpu=self.use_gpu,
show_log=False
)
@@ -116,16 +72,7 @@ class FileProcessor:
return None
def detect_file_type(self, file_path: str = None, file_bytes: bytes = None) -> FileType:
"""
Erkennt den Dateityp.
Args:
file_path: Pfad zur Datei
file_bytes: Dateiinhalt als Bytes
Returns:
FileType enum
"""
"""Erkennt den Dateityp."""
if file_path:
ext = Path(file_path).suffix.lower()
if ext == ".pdf":
@@ -140,14 +87,13 @@ class FileProcessor:
return FileType.TXT
if file_bytes:
# Magic number detection
if file_bytes[:4] == b'%PDF':
return FileType.PDF
elif file_bytes[:8] == b'\x89PNG\r\n\x1a\n':
return FileType.IMAGE
elif file_bytes[:2] in [b'\xff\xd8', b'BM']: # JPEG, BMP
elif file_bytes[:2] in [b'\xff\xd8', b'BM']:
return FileType.IMAGE
elif file_bytes[:4] == b'PK\x03\x04': # ZIP (DOCX)
elif file_bytes[:4] == b'PK\x03\x04':
return FileType.DOCX
return FileType.UNKNOWN
@@ -158,17 +104,7 @@ class FileProcessor:
file_bytes: bytes = None,
mode: ProcessingMode = ProcessingMode.MIXED
) -> ProcessingResult:
"""
Verarbeitet ein Dokument.
Args:
file_path: Pfad zur Datei
file_bytes: Dateiinhalt als Bytes
mode: Verarbeitungsmodus
Returns:
ProcessingResult mit extrahiertem Text und Metadaten
"""
"""Verarbeitet ein Dokument."""
if not file_path and not file_bytes:
raise ValueError("Entweder file_path oder file_bytes muss angegeben werden")
@@ -186,18 +122,12 @@ class FileProcessor:
else:
raise ValueError(f"Nicht unterstützter Dateityp: {file_type}")
def _process_pdf(
self,
file_path: str = None,
file_bytes: bytes = None,
mode: ProcessingMode = ProcessingMode.MIXED
) -> ProcessingResult:
def _process_pdf(self, file_path=None, file_bytes=None, mode=ProcessingMode.MIXED):
"""Verarbeitet PDF-Dateien."""
try:
import fitz # PyMuPDF
import fitz
except ImportError:
logger.warning("PyMuPDF nicht installiert - versuche Fallback")
# Fallback: PDF als Bild behandeln
return self._process_image(file_path, file_bytes, mode)
if file_bytes:
@@ -205,35 +135,27 @@ class FileProcessor:
else:
doc = fitz.open(file_path)
all_text = []
all_regions = []
total_confidence = 0.0
region_count = 0
all_text, all_regions = [], []
total_confidence, region_count = 0.0, 0
for page_num, page in enumerate(doc, start=1):
# Erst versuchen Text direkt zu extrahieren
page_text = page.get_text()
if page_text.strip() and mode != ProcessingMode.OCR_HANDWRITING:
# PDF enthält Text (nicht nur Bilder)
all_text.append(page_text)
all_regions.append(ProcessedRegion(
text=page_text,
confidence=1.0,
text=page_text, confidence=1.0,
bbox=(0, 0, int(page.rect.width), int(page.rect.height)),
page=page_num
))
total_confidence += 1.0
region_count += 1
else:
# Seite als Bild rendern und OCR anwenden
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x Auflösung
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
img_bytes = pix.tobytes("png")
img = Image.open(io.BytesIO(img_bytes))
ocr_result = self._ocr_image(img)
all_text.append(ocr_result["text"])
for region in ocr_result["regions"]:
region.page = page_num
all_regions.append(region)
@@ -241,55 +163,34 @@ class FileProcessor:
region_count += 1
doc.close()
avg_confidence = total_confidence / region_count if region_count > 0 else 0.0
return ProcessingResult(
text="\n\n".join(all_text),
confidence=avg_confidence,
text="\n\n".join(all_text), confidence=avg_confidence,
regions=all_regions,
page_count=len(doc) if hasattr(doc, '__len__') else 1,
file_type=FileType.PDF,
processing_mode=mode,
file_type=FileType.PDF, processing_mode=mode,
metadata={"source": file_path or "bytes"}
)
def _process_image(self, file_path=None, file_bytes=None, mode=ProcessingMode.MIXED):
    """Process an image file: preprocessing followed by OCR.

    Args:
        file_path: Path to the image on disk (used when file_bytes is None).
        file_bytes: Raw image content as bytes.
        mode: Requested processing mode (recorded in the result).

    Returns:
        ProcessingResult with OCR text, regions and image metadata.
    """
    if file_bytes:
        img = Image.open(io.BytesIO(file_bytes))
    else:
        img = Image.open(file_path)
    # Clean the image up (grayscale, denoise, contrast, binarize)
    # before handing it to the OCR engine.
    processed_img = self._preprocess_image(img)
    ocr_result = self._ocr_image(processed_img)
    return ProcessingResult(
        text=ocr_result["text"], confidence=ocr_result["confidence"],
        regions=ocr_result["regions"], page_count=1,
        file_type=FileType.IMAGE, processing_mode=mode,
        metadata={"source": file_path or "bytes", "image_size": img.size}
    )
def _process_docx(
self,
file_path: str = None,
file_bytes: bytes = None
) -> ProcessingResult:
def _process_docx(self, file_path=None, file_bytes=None):
"""Verarbeitet DOCX-Dateien."""
try:
from docx import Document
@@ -306,7 +207,6 @@ class FileProcessor:
if para.text.strip():
paragraphs.append(para.text)
# Auch Tabellen extrahieren
for table in doc.tables:
for row in table.rows:
row_text = " | ".join(cell.text for cell in row.cells)
@@ -316,25 +216,14 @@ class FileProcessor:
text = "\n\n".join(paragraphs)
return ProcessingResult(
text=text,
confidence=1.0, # Direkte Textextraktion
regions=[ProcessedRegion(
text=text,
confidence=1.0,
bbox=(0, 0, 0, 0),
page=1
)],
page_count=1,
file_type=FileType.DOCX,
text=text, confidence=1.0,
regions=[ProcessedRegion(text=text, confidence=1.0, bbox=(0, 0, 0, 0), page=1)],
page_count=1, file_type=FileType.DOCX,
processing_mode=ProcessingMode.TEXT_EXTRACT,
metadata={"source": file_path or "bytes"}
)
def _process_txt(
self,
file_path: str = None,
file_bytes: bytes = None
) -> ProcessingResult:
def _process_txt(self, file_path=None, file_bytes=None):
"""Verarbeitet Textdateien."""
if file_bytes:
text = file_bytes.decode('utf-8', errors='ignore')
@@ -343,146 +232,65 @@ class FileProcessor:
text = f.read()
return ProcessingResult(
text=text,
confidence=1.0,
regions=[ProcessedRegion(
text=text,
confidence=1.0,
bbox=(0, 0, 0, 0),
page=1
)],
page_count=1,
file_type=FileType.TXT,
text=text, confidence=1.0,
regions=[ProcessedRegion(text=text, confidence=1.0, bbox=(0, 0, 0, 0), page=1)],
page_count=1, file_type=FileType.TXT,
processing_mode=ProcessingMode.TEXT_EXTRACT,
metadata={"source": file_path or "bytes"}
)
def _preprocess_image(self, img: Image.Image) -> Image.Image:
    """Preprocess an image for better OCR results.

    Pipeline: grayscale -> denoising -> CLAHE contrast enhancement ->
    adaptive binarization.
    """
    # PIL (RGB) to OpenCV (BGR).
    cv_img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
    # Non-local-means denoising suppresses scan noise.
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
    # Local contrast enhancement (CLAHE) before thresholding.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(denoised)
    # Adaptive thresholding copes with uneven lighting across the page.
    binary = cv2.adaptiveThreshold(
        enhanced, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )
    # Back to PIL for the OCR stage.
    return Image.fromarray(binary)
def _ocr_image(self, img: Image.Image) -> Dict[str, Any]:
    """Run OCR on a single image.

    Returns:
        Dict with keys "text" (recognized lines joined by newlines),
        "confidence" (average over regions, 0.0 if none) and
        "regions" (list of ProcessedRegion).
    """
    if self.ocr_engine is None:
        # OCR backend missing - return a sentinel result instead of raising.
        return {"text": "[OCR nicht verfügbar - bitte PaddleOCR installieren]",
                "confidence": 0.0, "regions": []}
    img_array = np.array(img)
    # PaddleOCR expects an RGB array; expand grayscale input.
    if len(img_array.shape) == 2:
        img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
    result = self.ocr_engine.ocr(img_array, cls=True)
    if not result or not result[0]:
        return {"text": "", "confidence": 0.0, "regions": []}
    all_text, all_regions = [], []
    total_confidence = 0.0
    for line in result[0]:
        bbox_points = line[0]  # four corner points [[x1,y1], ..., [x4,y4]]
        text, confidence = line[1]
        # Collapse the quadrilateral to an axis-aligned (x1, y1, x2, y2) box.
        x_coords = [p[0] for p in bbox_points]
        y_coords = [p[1] for p in bbox_points]
        bbox = (int(min(x_coords)), int(min(y_coords)),
                int(max(x_coords)), int(max(y_coords)))
        all_text.append(text)
        all_regions.append(ProcessedRegion(text=text, confidence=confidence, bbox=bbox))
        total_confidence += confidence
    avg_confidence = total_confidence / len(all_regions) if all_regions else 0.0
    return {"text": "\n".join(all_text), "confidence": avg_confidence, "regions": all_regions}
def extract_handwriting_regions(
self,
img: Image.Image,
min_area: int = 500
) -> List[Dict[str, Any]]:
"""
Erkennt und extrahiert handschriftliche Bereiche aus einem Bild.
Nützlich für Klausuren mit gedruckten Fragen und handschriftlichen Antworten.
Args:
img: Eingabebild
min_area: Minimale Fläche für erkannte Regionen
Returns:
Liste von Regionen mit Koordinaten und erkanntem Text
"""
# Bildvorverarbeitung
def extract_handwriting_regions(self, img: Image.Image, min_area: int = 500) -> List[Dict[str, Any]]:
"""Erkennt und extrahiert handschriftliche Bereiche aus einem Bild."""
cv_img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
# Kanten erkennen
edges = cv2.Canny(gray, 50, 150)
# Morphologische Operationen zum Verbinden
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
dilated = cv2.dilate(edges, kernel, iterations=2)
# Konturen finden
contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
regions = []
@@ -490,25 +298,15 @@ class FileProcessor:
area = cv2.contourArea(contour)
if area < min_area:
continue
x, y, w, h = cv2.boundingRect(contour)
# Region ausschneiden
region_img = img.crop((x, y, x + w, y + h))
# OCR auf Region anwenden
ocr_result = self._ocr_image(region_img)
regions.append({
"bbox": (x, y, x + w, y + h),
"area": area,
"text": ocr_result["text"],
"confidence": ocr_result["confidence"]
"bbox": (x, y, x + w, y + h), "area": area,
"text": ocr_result["text"], "confidence": ocr_result["confidence"]
})
# Nach Y-Position sortieren (oben nach unten)
regions.sort(key=lambda r: r["bbox"][1])
return regions
@@ -525,39 +323,25 @@ def get_file_processor() -> FileProcessor:
# Convenience functions
def process_file(file_path=None, file_bytes=None, mode=ProcessingMode.MIXED) -> ProcessingResult:
    """Process a file via the shared FileProcessor instance.

    Args:
        file_path: Path to the file on disk.
        file_bytes: File content as raw bytes (alternative to file_path).
        mode: Processing mode (default: MIXED).

    Returns:
        ProcessingResult with extracted text and metadata.
    """
    processor = get_file_processor()
    return processor.process(file_path, file_bytes, mode)
def extract_text_from_pdf(file_path=None, file_bytes=None) -> str:
    """Extract text from a PDF file using TEXT_EXTRACT mode."""
    result = process_file(file_path, file_bytes, ProcessingMode.TEXT_EXTRACT)
    return result.text
def ocr_image(file_path: str = None, file_bytes: bytes = None) -> str:
def ocr_image(file_path=None, file_bytes=None) -> str:
"""Führt OCR auf einem Bild aus."""
result = process_file(file_path, file_bytes, ProcessingMode.OCR_PRINTED)
return result.text
def ocr_handwriting(file_path: str = None, file_bytes: bytes = None) -> str:
def ocr_handwriting(file_path=None, file_bytes=None) -> str:
"""Führt Handschrift-OCR auf einem Bild aus."""
result = process_file(file_path, file_bytes, ProcessingMode.OCR_HANDWRITING)
return result.text

View File

@@ -0,0 +1,48 @@
"""
File Processor - Datenmodelle und Enums.
Typen fuer Dokumentenverarbeitung: Dateitypen, Modi, Ergebnisse.
"""
from typing import List, Dict, Any, Tuple
from dataclasses import dataclass
from enum import Enum
class FileType(str, Enum):
    """Supported file types for document processing."""

    PDF = "pdf"          # Portable Document Format
    IMAGE = "image"      # raster images (PNG, JPEG, BMP)
    DOCX = "docx"        # Office Open XML word document
    DOC = "doc"          # legacy Word document
    TXT = "txt"          # plain text
    UNKNOWN = "unknown"  # anything not recognized
class ProcessingMode(str, Enum):
    """Available document-processing strategies."""

    OCR_HANDWRITING = "ocr_handwriting"  # handwriting recognition
    OCR_PRINTED = "ocr_printed"          # printed-text OCR
    TEXT_EXTRACT = "text_extract"        # direct text extraction (PDF/DOCX)
    MIXED = "mixed"                      # OCR combined with text extraction
@dataclass
class ProcessedRegion:
    """A single recognized text region on a page."""

    text: str                        # recognized text content
    confidence: float                # recognition confidence score
    bbox: Tuple[int, int, int, int]  # (x1, y1, x2, y2) in pixels
    page: int = 1                    # 1-based page number
@dataclass
class ProcessingResult:
    """Result of a document-processing run.

    Bundles the extracted text of all pages with per-region details
    and bookkeeping metadata.
    """

    text: str                        # full extracted text (pages joined)
    confidence: float                # average region confidence (1.0 for direct extraction)
    regions: List[ProcessedRegion]   # individual recognized text regions
    page_count: int                  # number of pages processed
    file_type: FileType              # detected type of the input file
    processing_mode: ProcessingMode  # mode used for this run
    metadata: Dict[str, Any]         # extra info, e.g. {"source": ...}