A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
564 lines
17 KiB
Python
"""
|
|
File Processor Service - Dokumentenverarbeitung für BreakPilot.
|
|
|
|
Shared Service für:
|
|
- OCR (Optical Character Recognition) für Handschrift und gedruckten Text
|
|
- PDF-Parsing und Textextraktion
|
|
- Bildverarbeitung und -optimierung
|
|
- DOCX/DOC Textextraktion
|
|
|
|
Verwendet:
|
|
- PaddleOCR für deutsche Handschrift
|
|
- PyMuPDF für PDF-Verarbeitung
|
|
- python-docx für DOCX-Dateien
|
|
- OpenCV für Bildvorverarbeitung
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import io
|
|
import base64
|
|
from pathlib import Path
|
|
from typing import Optional, List, Dict, Any, Tuple, Union
|
|
from dataclasses import dataclass
|
|
from enum import Enum
|
|
|
|
import cv2
|
|
import numpy as np
|
|
from PIL import Image
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class FileType(str, Enum):
|
|
"""Unterstützte Dateitypen."""
|
|
PDF = "pdf"
|
|
IMAGE = "image"
|
|
DOCX = "docx"
|
|
DOC = "doc"
|
|
TXT = "txt"
|
|
UNKNOWN = "unknown"
|
|
|
|
|
|
class ProcessingMode(str, Enum):
|
|
"""Verarbeitungsmodi."""
|
|
OCR_HANDWRITING = "ocr_handwriting" # Handschrifterkennung
|
|
OCR_PRINTED = "ocr_printed" # Gedruckter Text
|
|
TEXT_EXTRACT = "text_extract" # Textextraktion (PDF/DOCX)
|
|
MIXED = "mixed" # Kombiniert OCR + Textextraktion
|
|
|
|
|
|
@dataclass
|
|
class ProcessedRegion:
|
|
"""Ein erkannter Textbereich."""
|
|
text: str
|
|
confidence: float
|
|
bbox: Tuple[int, int, int, int] # x1, y1, x2, y2
|
|
page: int = 1
|
|
|
|
|
|
@dataclass
|
|
class ProcessingResult:
|
|
"""Ergebnis der Dokumentenverarbeitung."""
|
|
text: str
|
|
confidence: float
|
|
regions: List[ProcessedRegion]
|
|
page_count: int
|
|
file_type: FileType
|
|
processing_mode: ProcessingMode
|
|
metadata: Dict[str, Any]
|
|
|
|
|
|
class FileProcessor:
    """
    Central document processing for BreakPilot.

    Supports:
    - Handwriting recognition (OCR) for exams
    - Text extraction from PDFs
    - DOCX/DOC handling
    - Image preprocessing for better OCR results

    OCR is backed by PaddleOCR when installed; otherwise a placeholder
    result is returned (see ``_ocr_image``).
    """

    def __init__(self, ocr_lang: str = "de", use_gpu: bool = False):
        """
        Initialize the file processor.

        Args:
            ocr_lang: OCR language (default "de" for German).
            use_gpu: Use GPU for OCR (speeds up processing).
        """
        self.ocr_lang = ocr_lang
        self.use_gpu = use_gpu
        self._ocr_engine = None  # created lazily via the `ocr_engine` property

        logger.info(f"FileProcessor initialized (lang={ocr_lang}, gpu={use_gpu})")

    @property
    def ocr_engine(self):
        """Lazily initialized OCR engine (PaddleOCR instance, or None).

        NOTE: when PaddleOCR is not installed, `_init_ocr_engine` returns
        None and the import is retried on every access (cheap: ImportError
        is raised immediately).
        """
        if self._ocr_engine is None:
            self._ocr_engine = self._init_ocr_engine()
        return self._ocr_engine

    def _init_ocr_engine(self):
        """Create a PaddleOCR engine, or return None when PaddleOCR is missing."""
        try:
            from paddleocr import PaddleOCR
            # PaddleOCR expects full language names (e.g. "german"); map the
            # common ISO code so the `ocr_lang` constructor argument is
            # honored instead of being silently ignored. The default
            # ocr_lang="de" still yields "german", preserving old behavior.
            lang = {"de": "german"}.get(self.ocr_lang, self.ocr_lang)
            return PaddleOCR(
                use_angle_cls=True,
                lang=lang,
                use_gpu=self.use_gpu,
                show_log=False
            )
        except ImportError:
            logger.warning("PaddleOCR nicht installiert - verwende Fallback")
            return None

    def detect_file_type(self, file_path: str = None, file_bytes: bytes = None) -> FileType:
        """
        Detect the file type.

        Extension-based detection (from `file_path`) takes precedence;
        magic-number detection (from `file_bytes`) is the fallback.

        Args:
            file_path: Path to the file.
            file_bytes: File content as bytes.

        Returns:
            FileType enum value; FileType.UNKNOWN if nothing matches.
        """
        if file_path:
            ext = Path(file_path).suffix.lower()
            if ext == ".pdf":
                return FileType.PDF
            elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".gif"]:
                return FileType.IMAGE
            elif ext == ".docx":
                return FileType.DOCX
            elif ext == ".doc":
                return FileType.DOC
            elif ext == ".txt":
                return FileType.TXT

        if file_bytes:
            # Magic number detection
            if file_bytes[:4] == b'%PDF':
                return FileType.PDF
            elif file_bytes[:8] == b'\x89PNG\r\n\x1a\n':
                return FileType.IMAGE
            elif file_bytes[:2] in [b'\xff\xd8', b'BM']:  # JPEG, BMP
                return FileType.IMAGE
            # GIF and TIFF magic numbers, for consistency with the
            # extension list above which already accepts .gif/.tiff.
            elif file_bytes[:4] in (b'GIF8', b'II*\x00', b'MM\x00*'):
                return FileType.IMAGE
            elif file_bytes[:4] == b'PK\x03\x04':  # ZIP container (DOCX)
                return FileType.DOCX

        return FileType.UNKNOWN

    def process(
        self,
        file_path: str = None,
        file_bytes: bytes = None,
        mode: ProcessingMode = ProcessingMode.MIXED
    ) -> ProcessingResult:
        """
        Process a document.

        Args:
            file_path: Path to the file.
            file_bytes: File content as bytes.
            mode: Processing mode.

        Returns:
            ProcessingResult with extracted text and metadata.

        Raises:
            ValueError: If neither input is given or the type is unsupported.
        """
        if not file_path and not file_bytes:
            raise ValueError("Entweder file_path oder file_bytes muss angegeben werden")

        file_type = self.detect_file_type(file_path, file_bytes)
        logger.info(f"Processing file of type: {file_type}")

        if file_type == FileType.PDF:
            return self._process_pdf(file_path, file_bytes, mode)
        elif file_type == FileType.IMAGE:
            return self._process_image(file_path, file_bytes, mode)
        elif file_type == FileType.DOCX:
            return self._process_docx(file_path, file_bytes)
        elif file_type == FileType.TXT:
            return self._process_txt(file_path, file_bytes)
        else:
            # NOTE: FileType.DOC (legacy Word) has no extractor and is
            # rejected here as unsupported.
            raise ValueError(f"Nicht unterstützter Dateityp: {file_type}")

    def _process_pdf(
        self,
        file_path: str = None,
        file_bytes: bytes = None,
        mode: ProcessingMode = ProcessingMode.MIXED
    ) -> ProcessingResult:
        """Process PDF files: direct text extraction per page, OCR fallback
        for image-only pages (or when handwriting mode is forced)."""
        try:
            import fitz  # PyMuPDF
        except ImportError:
            logger.warning("PyMuPDF nicht installiert - versuche Fallback")
            # Fallback: treat the PDF as an image
            return self._process_image(file_path, file_bytes, mode)

        if file_bytes:
            doc = fitz.open(stream=file_bytes, filetype="pdf")
        else:
            doc = fitz.open(file_path)

        all_text = []
        all_regions = []
        total_confidence = 0.0
        region_count = 0

        try:
            # Capture the page count BEFORE close(): len() on a closed
            # PyMuPDF document fails.
            page_count = len(doc)

            for page_num, page in enumerate(doc, start=1):
                # First try to extract text directly
                page_text = page.get_text()

                if page_text.strip() and mode != ProcessingMode.OCR_HANDWRITING:
                    # PDF contains real text (not only scanned images)
                    all_text.append(page_text)
                    all_regions.append(ProcessedRegion(
                        text=page_text,
                        confidence=1.0,
                        bbox=(0, 0, int(page.rect.width), int(page.rect.height)),
                        page=page_num
                    ))
                    total_confidence += 1.0
                    region_count += 1
                else:
                    # Render the page as an image and apply OCR
                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x resolution
                    img_bytes = pix.tobytes("png")
                    img = Image.open(io.BytesIO(img_bytes))

                    ocr_result = self._ocr_image(img)
                    all_text.append(ocr_result["text"])

                    for region in ocr_result["regions"]:
                        region.page = page_num
                        all_regions.append(region)
                        total_confidence += region.confidence
                        region_count += 1
        finally:
            # Always release the document, even if a page raised.
            doc.close()

        avg_confidence = total_confidence / region_count if region_count > 0 else 0.0

        return ProcessingResult(
            text="\n\n".join(all_text),
            confidence=avg_confidence,
            regions=all_regions,
            page_count=page_count,
            file_type=FileType.PDF,
            processing_mode=mode,
            metadata={"source": file_path or "bytes"}
        )

    def _process_image(
        self,
        file_path: str = None,
        file_bytes: bytes = None,
        mode: ProcessingMode = ProcessingMode.MIXED
    ) -> ProcessingResult:
        """Process image files: preprocess for OCR quality, then OCR."""
        if file_bytes:
            img = Image.open(io.BytesIO(file_bytes))
        else:
            img = Image.open(file_path)

        # Image preprocessing (grayscale, denoise, contrast, binarization)
        processed_img = self._preprocess_image(img)

        # OCR
        ocr_result = self._ocr_image(processed_img)

        return ProcessingResult(
            text=ocr_result["text"],
            confidence=ocr_result["confidence"],
            regions=ocr_result["regions"],
            page_count=1,
            file_type=FileType.IMAGE,
            processing_mode=mode,
            metadata={
                "source": file_path or "bytes",
                "image_size": img.size
            }
        )

    def _process_docx(
        self,
        file_path: str = None,
        file_bytes: bytes = None
    ) -> ProcessingResult:
        """Process DOCX files: extract paragraph and table text directly.

        Raises:
            ImportError: If python-docx is not installed.
        """
        try:
            from docx import Document
        except ImportError:
            raise ImportError("python-docx ist nicht installiert")

        if file_bytes:
            doc = Document(io.BytesIO(file_bytes))
        else:
            doc = Document(file_path)

        paragraphs = []
        for para in doc.paragraphs:
            if para.text.strip():
                paragraphs.append(para.text)

        # Also extract table contents (one " | "-joined line per row)
        for table in doc.tables:
            for row in table.rows:
                row_text = " | ".join(cell.text for cell in row.cells)
                if row_text.strip():
                    paragraphs.append(row_text)

        text = "\n\n".join(paragraphs)

        return ProcessingResult(
            text=text,
            confidence=1.0,  # direct text extraction, no OCR uncertainty
            regions=[ProcessedRegion(
                text=text,
                confidence=1.0,
                bbox=(0, 0, 0, 0),
                page=1
            )],
            page_count=1,
            file_type=FileType.DOCX,
            processing_mode=ProcessingMode.TEXT_EXTRACT,
            metadata={"source": file_path or "bytes"}
        )

    def _process_txt(
        self,
        file_path: str = None,
        file_bytes: bytes = None
    ) -> ProcessingResult:
        """Process plain-text files (UTF-8, undecodable bytes ignored)."""
        if file_bytes:
            text = file_bytes.decode('utf-8', errors='ignore')
        else:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()

        return ProcessingResult(
            text=text,
            confidence=1.0,
            regions=[ProcessedRegion(
                text=text,
                confidence=1.0,
                bbox=(0, 0, 0, 0),
                page=1
            )],
            page_count=1,
            file_type=FileType.TXT,
            processing_mode=ProcessingMode.TEXT_EXTRACT,
            metadata={"source": file_path or "bytes"}
        )

    def _preprocess_image(self, img: Image.Image) -> Image.Image:
        """
        Preprocess an image for better OCR results.

        Steps:
        - Conversion to grayscale
        - Noise reduction
        - Contrast enhancement (CLAHE)
        - Adaptive binarization
        """
        # Normalize to 3-channel RGB first: PIL may deliver grayscale ("L"),
        # palette ("P") or RGBA images, which would break COLOR_RGB2BGR.
        cv_img = cv2.cvtColor(np.array(img.convert("RGB")), cv2.COLOR_RGB2BGR)

        # Convert to grayscale
        gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)

        # Noise reduction
        denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

        # Contrast enhancement (CLAHE)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(denoised)

        # Adaptive binarization
        binary = cv2.adaptiveThreshold(
            enhanced,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            11,
            2
        )

        # Back to PIL
        return Image.fromarray(binary)

    def _ocr_image(self, img: Image.Image) -> Dict[str, Any]:
        """
        Run OCR on an image.

        Returns:
            Dict with "text" (joined lines), "confidence" (average over
            regions) and "regions" (list of ProcessedRegion).
        """
        if self.ocr_engine is None:
            # Fallback when no OCR engine is available
            return {
                "text": "[OCR nicht verfügbar - bitte PaddleOCR installieren]",
                "confidence": 0.0,
                "regions": []
            }

        # PIL to numpy array
        img_array = np.array(img)

        # If grayscale, convert to RGB (PaddleOCR expects RGB)
        if len(img_array.shape) == 2:
            img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)

        # Run OCR
        result = self.ocr_engine.ocr(img_array, cls=True)

        if not result or not result[0]:
            return {"text": "", "confidence": 0.0, "regions": []}

        all_text = []
        all_regions = []
        total_confidence = 0.0

        for line in result[0]:
            bbox_points = line[0]  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            text, confidence = line[1]

            # Convert the quadrilateral to an axis-aligned x1, y1, x2, y2 box
            x_coords = [p[0] for p in bbox_points]
            y_coords = [p[1] for p in bbox_points]
            bbox = (
                int(min(x_coords)),
                int(min(y_coords)),
                int(max(x_coords)),
                int(max(y_coords))
            )

            all_text.append(text)
            all_regions.append(ProcessedRegion(
                text=text,
                confidence=confidence,
                bbox=bbox
            ))
            total_confidence += confidence

        avg_confidence = total_confidence / len(all_regions) if all_regions else 0.0

        return {
            "text": "\n".join(all_text),
            "confidence": avg_confidence,
            "regions": all_regions
        }

    def extract_handwriting_regions(
        self,
        img: Image.Image,
        min_area: int = 500
    ) -> List[Dict[str, Any]]:
        """
        Detect and extract handwritten areas from an image.

        Useful for exams with printed questions and handwritten answers.

        Args:
            img: Input image.
            min_area: Minimum contour area for accepted regions.

        Returns:
            List of region dicts (bbox, area, text, confidence),
            sorted top to bottom.
        """
        # Image preprocessing; normalize to RGB first so grayscale/RGBA
        # inputs don't break the color conversion.
        cv_img = cv2.cvtColor(np.array(img.convert("RGB")), cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)

        # Edge detection
        edges = cv2.Canny(gray, 50, 150)

        # Morphological dilation to merge nearby strokes into blocks
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
        dilated = cv2.dilate(edges, kernel, iterations=2)

        # Find contours
        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        regions = []
        for contour in contours:
            area = cv2.contourArea(contour)
            if area < min_area:
                continue

            x, y, w, h = cv2.boundingRect(contour)

            # Crop the region
            region_img = img.crop((x, y, x + w, y + h))

            # Apply OCR to the region
            ocr_result = self._ocr_image(region_img)

            regions.append({
                "bbox": (x, y, x + w, y + h),
                "area": area,
                "text": ocr_result["text"],
                "confidence": ocr_result["confidence"]
            })

        # Sort by Y position (top to bottom)
        regions.sort(key=lambda r: r["bbox"][1])

        return regions
|
|
|
|
|
|
# Module-level singleton, created on first request.
_file_processor: Optional[FileProcessor] = None


def get_file_processor() -> FileProcessor:
    """Return the shared FileProcessor, creating it lazily on first call."""
    global _file_processor
    if _file_processor is None:
        _file_processor = FileProcessor()
    return _file_processor
|
|
|
|
|
|
# Convenience functions
def process_file(
    file_path: str = None,
    file_bytes: bytes = None,
    mode: ProcessingMode = ProcessingMode.MIXED
) -> ProcessingResult:
    """Process a file through the shared FileProcessor singleton.

    Args:
        file_path: Path to the file on disk.
        file_bytes: Raw file content as bytes.
        mode: Processing mode to apply.

    Returns:
        ProcessingResult with extracted text and metadata.
    """
    return get_file_processor().process(file_path, file_bytes, mode)
|
|
|
|
|
|
def extract_text_from_pdf(file_path: str = None, file_bytes: bytes = None) -> str:
    """Extract the text content of a PDF file."""
    return process_file(file_path, file_bytes, ProcessingMode.TEXT_EXTRACT).text
|
|
|
|
|
|
def ocr_image(file_path: str = None, file_bytes: bytes = None) -> str:
    """Run printed-text OCR on an image and return the recognized text."""
    return process_file(file_path, file_bytes, ProcessingMode.OCR_PRINTED).text
|
|
|
|
|
|
def ocr_handwriting(file_path: str = None, file_bytes: bytes = None) -> str:
    """Run handwriting OCR on an image and return the recognized text."""
    return process_file(file_path, file_bytes, ProcessingMode.OCR_HANDWRITING).text
|