fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
563
backend/services/file_processor.py
Normal file
563
backend/services/file_processor.py
Normal file
@@ -0,0 +1,563 @@
|
||||
"""
|
||||
File Processor Service - Dokumentenverarbeitung für BreakPilot.
|
||||
|
||||
Shared Service für:
|
||||
- OCR (Optical Character Recognition) für Handschrift und gedruckten Text
|
||||
- PDF-Parsing und Textextraktion
|
||||
- Bildverarbeitung und -optimierung
|
||||
- DOCX/DOC Textextraktion
|
||||
|
||||
Verwendet:
|
||||
- PaddleOCR für deutsche Handschrift
|
||||
- PyMuPDF für PDF-Verarbeitung
|
||||
- python-docx für DOCX-Dateien
|
||||
- OpenCV für Bildvorverarbeitung
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import io
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Dict, Any, Tuple, Union
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FileType(str, Enum):
|
||||
"""Unterstützte Dateitypen."""
|
||||
PDF = "pdf"
|
||||
IMAGE = "image"
|
||||
DOCX = "docx"
|
||||
DOC = "doc"
|
||||
TXT = "txt"
|
||||
UNKNOWN = "unknown"
|
||||
|
||||
|
||||
class ProcessingMode(str, Enum):
|
||||
"""Verarbeitungsmodi."""
|
||||
OCR_HANDWRITING = "ocr_handwriting" # Handschrifterkennung
|
||||
OCR_PRINTED = "ocr_printed" # Gedruckter Text
|
||||
TEXT_EXTRACT = "text_extract" # Textextraktion (PDF/DOCX)
|
||||
MIXED = "mixed" # Kombiniert OCR + Textextraktion
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProcessedRegion:
|
||||
"""Ein erkannter Textbereich."""
|
||||
text: str
|
||||
confidence: float
|
||||
bbox: Tuple[int, int, int, int] # x1, y1, x2, y2
|
||||
page: int = 1
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProcessingResult:
|
||||
"""Ergebnis der Dokumentenverarbeitung."""
|
||||
text: str
|
||||
confidence: float
|
||||
regions: List[ProcessedRegion]
|
||||
page_count: int
|
||||
file_type: FileType
|
||||
processing_mode: ProcessingMode
|
||||
metadata: Dict[str, Any]
|
||||
|
||||
|
||||
class FileProcessor:
|
||||
"""
|
||||
Zentrale Dokumentenverarbeitung für BreakPilot.
|
||||
|
||||
Unterstützt:
|
||||
- Handschrifterkennung (OCR) für Klausuren
|
||||
- Textextraktion aus PDFs
|
||||
- DOCX/DOC Verarbeitung
|
||||
- Bildvorverarbeitung für bessere OCR-Ergebnisse
|
||||
"""
|
||||
|
||||
def __init__(self, ocr_lang: str = "de", use_gpu: bool = False):
|
||||
"""
|
||||
Initialisiert den File Processor.
|
||||
|
||||
Args:
|
||||
ocr_lang: Sprache für OCR (default: "de" für Deutsch)
|
||||
use_gpu: GPU für OCR nutzen (beschleunigt Verarbeitung)
|
||||
"""
|
||||
self.ocr_lang = ocr_lang
|
||||
self.use_gpu = use_gpu
|
||||
self._ocr_engine = None
|
||||
|
||||
logger.info(f"FileProcessor initialized (lang={ocr_lang}, gpu={use_gpu})")
|
||||
|
||||
@property
|
||||
def ocr_engine(self):
|
||||
"""Lazy-Loading des OCR-Engines."""
|
||||
if self._ocr_engine is None:
|
||||
self._ocr_engine = self._init_ocr_engine()
|
||||
return self._ocr_engine
|
||||
|
||||
def _init_ocr_engine(self):
|
||||
"""Initialisiert PaddleOCR oder Fallback."""
|
||||
try:
|
||||
from paddleocr import PaddleOCR
|
||||
return PaddleOCR(
|
||||
use_angle_cls=True,
|
||||
lang='german', # Deutsch
|
||||
use_gpu=self.use_gpu,
|
||||
show_log=False
|
||||
)
|
||||
except ImportError:
|
||||
logger.warning("PaddleOCR nicht installiert - verwende Fallback")
|
||||
return None
|
||||
|
||||
def detect_file_type(self, file_path: str = None, file_bytes: bytes = None) -> FileType:
|
||||
"""
|
||||
Erkennt den Dateityp.
|
||||
|
||||
Args:
|
||||
file_path: Pfad zur Datei
|
||||
file_bytes: Dateiinhalt als Bytes
|
||||
|
||||
Returns:
|
||||
FileType enum
|
||||
"""
|
||||
if file_path:
|
||||
ext = Path(file_path).suffix.lower()
|
||||
if ext == ".pdf":
|
||||
return FileType.PDF
|
||||
elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".gif"]:
|
||||
return FileType.IMAGE
|
||||
elif ext == ".docx":
|
||||
return FileType.DOCX
|
||||
elif ext == ".doc":
|
||||
return FileType.DOC
|
||||
elif ext == ".txt":
|
||||
return FileType.TXT
|
||||
|
||||
if file_bytes:
|
||||
# Magic number detection
|
||||
if file_bytes[:4] == b'%PDF':
|
||||
return FileType.PDF
|
||||
elif file_bytes[:8] == b'\x89PNG\r\n\x1a\n':
|
||||
return FileType.IMAGE
|
||||
elif file_bytes[:2] in [b'\xff\xd8', b'BM']: # JPEG, BMP
|
||||
return FileType.IMAGE
|
||||
elif file_bytes[:4] == b'PK\x03\x04': # ZIP (DOCX)
|
||||
return FileType.DOCX
|
||||
|
||||
return FileType.UNKNOWN
|
||||
|
||||
def process(
|
||||
self,
|
||||
file_path: str = None,
|
||||
file_bytes: bytes = None,
|
||||
mode: ProcessingMode = ProcessingMode.MIXED
|
||||
) -> ProcessingResult:
|
||||
"""
|
||||
Verarbeitet ein Dokument.
|
||||
|
||||
Args:
|
||||
file_path: Pfad zur Datei
|
||||
file_bytes: Dateiinhalt als Bytes
|
||||
mode: Verarbeitungsmodus
|
||||
|
||||
Returns:
|
||||
ProcessingResult mit extrahiertem Text und Metadaten
|
||||
"""
|
||||
if not file_path and not file_bytes:
|
||||
raise ValueError("Entweder file_path oder file_bytes muss angegeben werden")
|
||||
|
||||
file_type = self.detect_file_type(file_path, file_bytes)
|
||||
logger.info(f"Processing file of type: {file_type}")
|
||||
|
||||
if file_type == FileType.PDF:
|
||||
return self._process_pdf(file_path, file_bytes, mode)
|
||||
elif file_type == FileType.IMAGE:
|
||||
return self._process_image(file_path, file_bytes, mode)
|
||||
elif file_type == FileType.DOCX:
|
||||
return self._process_docx(file_path, file_bytes)
|
||||
elif file_type == FileType.TXT:
|
||||
return self._process_txt(file_path, file_bytes)
|
||||
else:
|
||||
raise ValueError(f"Nicht unterstützter Dateityp: {file_type}")
|
||||
|
||||
def _process_pdf(
|
||||
self,
|
||||
file_path: str = None,
|
||||
file_bytes: bytes = None,
|
||||
mode: ProcessingMode = ProcessingMode.MIXED
|
||||
) -> ProcessingResult:
|
||||
"""Verarbeitet PDF-Dateien."""
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
except ImportError:
|
||||
logger.warning("PyMuPDF nicht installiert - versuche Fallback")
|
||||
# Fallback: PDF als Bild behandeln
|
||||
return self._process_image(file_path, file_bytes, mode)
|
||||
|
||||
if file_bytes:
|
||||
doc = fitz.open(stream=file_bytes, filetype="pdf")
|
||||
else:
|
||||
doc = fitz.open(file_path)
|
||||
|
||||
all_text = []
|
||||
all_regions = []
|
||||
total_confidence = 0.0
|
||||
region_count = 0
|
||||
|
||||
for page_num, page in enumerate(doc, start=1):
|
||||
# Erst versuchen Text direkt zu extrahieren
|
||||
page_text = page.get_text()
|
||||
|
||||
if page_text.strip() and mode != ProcessingMode.OCR_HANDWRITING:
|
||||
# PDF enthält Text (nicht nur Bilder)
|
||||
all_text.append(page_text)
|
||||
all_regions.append(ProcessedRegion(
|
||||
text=page_text,
|
||||
confidence=1.0,
|
||||
bbox=(0, 0, int(page.rect.width), int(page.rect.height)),
|
||||
page=page_num
|
||||
))
|
||||
total_confidence += 1.0
|
||||
region_count += 1
|
||||
else:
|
||||
# Seite als Bild rendern und OCR anwenden
|
||||
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x Auflösung
|
||||
img_bytes = pix.tobytes("png")
|
||||
img = Image.open(io.BytesIO(img_bytes))
|
||||
|
||||
ocr_result = self._ocr_image(img)
|
||||
all_text.append(ocr_result["text"])
|
||||
|
||||
for region in ocr_result["regions"]:
|
||||
region.page = page_num
|
||||
all_regions.append(region)
|
||||
total_confidence += region.confidence
|
||||
region_count += 1
|
||||
|
||||
doc.close()
|
||||
|
||||
avg_confidence = total_confidence / region_count if region_count > 0 else 0.0
|
||||
|
||||
return ProcessingResult(
|
||||
text="\n\n".join(all_text),
|
||||
confidence=avg_confidence,
|
||||
regions=all_regions,
|
||||
page_count=len(doc) if hasattr(doc, '__len__') else 1,
|
||||
file_type=FileType.PDF,
|
||||
processing_mode=mode,
|
||||
metadata={"source": file_path or "bytes"}
|
||||
)
|
||||
|
||||
def _process_image(
|
||||
self,
|
||||
file_path: str = None,
|
||||
file_bytes: bytes = None,
|
||||
mode: ProcessingMode = ProcessingMode.MIXED
|
||||
) -> ProcessingResult:
|
||||
"""Verarbeitet Bilddateien."""
|
||||
if file_bytes:
|
||||
img = Image.open(io.BytesIO(file_bytes))
|
||||
else:
|
||||
img = Image.open(file_path)
|
||||
|
||||
# Bildvorverarbeitung
|
||||
processed_img = self._preprocess_image(img)
|
||||
|
||||
# OCR
|
||||
ocr_result = self._ocr_image(processed_img)
|
||||
|
||||
return ProcessingResult(
|
||||
text=ocr_result["text"],
|
||||
confidence=ocr_result["confidence"],
|
||||
regions=ocr_result["regions"],
|
||||
page_count=1,
|
||||
file_type=FileType.IMAGE,
|
||||
processing_mode=mode,
|
||||
metadata={
|
||||
"source": file_path or "bytes",
|
||||
"image_size": img.size
|
||||
}
|
||||
)
|
||||
|
||||
def _process_docx(
|
||||
self,
|
||||
file_path: str = None,
|
||||
file_bytes: bytes = None
|
||||
) -> ProcessingResult:
|
||||
"""Verarbeitet DOCX-Dateien."""
|
||||
try:
|
||||
from docx import Document
|
||||
except ImportError:
|
||||
raise ImportError("python-docx ist nicht installiert")
|
||||
|
||||
if file_bytes:
|
||||
doc = Document(io.BytesIO(file_bytes))
|
||||
else:
|
||||
doc = Document(file_path)
|
||||
|
||||
paragraphs = []
|
||||
for para in doc.paragraphs:
|
||||
if para.text.strip():
|
||||
paragraphs.append(para.text)
|
||||
|
||||
# Auch Tabellen extrahieren
|
||||
for table in doc.tables:
|
||||
for row in table.rows:
|
||||
row_text = " | ".join(cell.text for cell in row.cells)
|
||||
if row_text.strip():
|
||||
paragraphs.append(row_text)
|
||||
|
||||
text = "\n\n".join(paragraphs)
|
||||
|
||||
return ProcessingResult(
|
||||
text=text,
|
||||
confidence=1.0, # Direkte Textextraktion
|
||||
regions=[ProcessedRegion(
|
||||
text=text,
|
||||
confidence=1.0,
|
||||
bbox=(0, 0, 0, 0),
|
||||
page=1
|
||||
)],
|
||||
page_count=1,
|
||||
file_type=FileType.DOCX,
|
||||
processing_mode=ProcessingMode.TEXT_EXTRACT,
|
||||
metadata={"source": file_path or "bytes"}
|
||||
)
|
||||
|
||||
def _process_txt(
|
||||
self,
|
||||
file_path: str = None,
|
||||
file_bytes: bytes = None
|
||||
) -> ProcessingResult:
|
||||
"""Verarbeitet Textdateien."""
|
||||
if file_bytes:
|
||||
text = file_bytes.decode('utf-8', errors='ignore')
|
||||
else:
|
||||
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
text = f.read()
|
||||
|
||||
return ProcessingResult(
|
||||
text=text,
|
||||
confidence=1.0,
|
||||
regions=[ProcessedRegion(
|
||||
text=text,
|
||||
confidence=1.0,
|
||||
bbox=(0, 0, 0, 0),
|
||||
page=1
|
||||
)],
|
||||
page_count=1,
|
||||
file_type=FileType.TXT,
|
||||
processing_mode=ProcessingMode.TEXT_EXTRACT,
|
||||
metadata={"source": file_path or "bytes"}
|
||||
)
|
||||
|
||||
def _preprocess_image(self, img: Image.Image) -> Image.Image:
|
||||
"""
|
||||
Vorverarbeitung des Bildes für bessere OCR-Ergebnisse.
|
||||
|
||||
- Konvertierung zu Graustufen
|
||||
- Kontrastverstärkung
|
||||
- Rauschunterdrückung
|
||||
- Binarisierung
|
||||
"""
|
||||
# PIL zu OpenCV
|
||||
cv_img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
|
||||
|
||||
# Zu Graustufen konvertieren
|
||||
gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
|
||||
|
||||
# Rauschunterdrückung
|
||||
denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
|
||||
|
||||
# Kontrastverstärkung (CLAHE)
|
||||
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
||||
enhanced = clahe.apply(denoised)
|
||||
|
||||
# Adaptive Binarisierung
|
||||
binary = cv2.adaptiveThreshold(
|
||||
enhanced,
|
||||
255,
|
||||
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
||||
cv2.THRESH_BINARY,
|
||||
11,
|
||||
2
|
||||
)
|
||||
|
||||
# Zurück zu PIL
|
||||
return Image.fromarray(binary)
|
||||
|
||||
def _ocr_image(self, img: Image.Image) -> Dict[str, Any]:
|
||||
"""
|
||||
Führt OCR auf einem Bild aus.
|
||||
|
||||
Returns:
|
||||
Dict mit text, confidence und regions
|
||||
"""
|
||||
if self.ocr_engine is None:
|
||||
# Fallback wenn kein OCR-Engine verfügbar
|
||||
return {
|
||||
"text": "[OCR nicht verfügbar - bitte PaddleOCR installieren]",
|
||||
"confidence": 0.0,
|
||||
"regions": []
|
||||
}
|
||||
|
||||
# PIL zu numpy array
|
||||
img_array = np.array(img)
|
||||
|
||||
# Wenn Graustufen, zu RGB konvertieren (PaddleOCR erwartet RGB)
|
||||
if len(img_array.shape) == 2:
|
||||
img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
|
||||
|
||||
# OCR ausführen
|
||||
result = self.ocr_engine.ocr(img_array, cls=True)
|
||||
|
||||
if not result or not result[0]:
|
||||
return {"text": "", "confidence": 0.0, "regions": []}
|
||||
|
||||
all_text = []
|
||||
all_regions = []
|
||||
total_confidence = 0.0
|
||||
|
||||
for line in result[0]:
|
||||
bbox_points = line[0] # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
|
||||
text, confidence = line[1]
|
||||
|
||||
# Bounding Box zu x1, y1, x2, y2 konvertieren
|
||||
x_coords = [p[0] for p in bbox_points]
|
||||
y_coords = [p[1] for p in bbox_points]
|
||||
bbox = (
|
||||
int(min(x_coords)),
|
||||
int(min(y_coords)),
|
||||
int(max(x_coords)),
|
||||
int(max(y_coords))
|
||||
)
|
||||
|
||||
all_text.append(text)
|
||||
all_regions.append(ProcessedRegion(
|
||||
text=text,
|
||||
confidence=confidence,
|
||||
bbox=bbox
|
||||
))
|
||||
total_confidence += confidence
|
||||
|
||||
avg_confidence = total_confidence / len(all_regions) if all_regions else 0.0
|
||||
|
||||
return {
|
||||
"text": "\n".join(all_text),
|
||||
"confidence": avg_confidence,
|
||||
"regions": all_regions
|
||||
}
|
||||
|
||||
def extract_handwriting_regions(
|
||||
self,
|
||||
img: Image.Image,
|
||||
min_area: int = 500
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Erkennt und extrahiert handschriftliche Bereiche aus einem Bild.
|
||||
|
||||
Nützlich für Klausuren mit gedruckten Fragen und handschriftlichen Antworten.
|
||||
|
||||
Args:
|
||||
img: Eingabebild
|
||||
min_area: Minimale Fläche für erkannte Regionen
|
||||
|
||||
Returns:
|
||||
Liste von Regionen mit Koordinaten und erkanntem Text
|
||||
"""
|
||||
# Bildvorverarbeitung
|
||||
cv_img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
|
||||
gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
|
||||
|
||||
# Kanten erkennen
|
||||
edges = cv2.Canny(gray, 50, 150)
|
||||
|
||||
# Morphologische Operationen zum Verbinden
|
||||
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5))
|
||||
dilated = cv2.dilate(edges, kernel, iterations=2)
|
||||
|
||||
# Konturen finden
|
||||
contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
||||
|
||||
regions = []
|
||||
for contour in contours:
|
||||
area = cv2.contourArea(contour)
|
||||
if area < min_area:
|
||||
continue
|
||||
|
||||
x, y, w, h = cv2.boundingRect(contour)
|
||||
|
||||
# Region ausschneiden
|
||||
region_img = img.crop((x, y, x + w, y + h))
|
||||
|
||||
# OCR auf Region anwenden
|
||||
ocr_result = self._ocr_image(region_img)
|
||||
|
||||
regions.append({
|
||||
"bbox": (x, y, x + w, y + h),
|
||||
"area": area,
|
||||
"text": ocr_result["text"],
|
||||
"confidence": ocr_result["confidence"]
|
||||
})
|
||||
|
||||
# Nach Y-Position sortieren (oben nach unten)
|
||||
regions.sort(key=lambda r: r["bbox"][1])
|
||||
|
||||
return regions
|
||||
|
||||
|
||||
# Singleton-Instanz
|
||||
_file_processor: Optional[FileProcessor] = None
|
||||
|
||||
|
||||
def get_file_processor() -> FileProcessor:
|
||||
"""Gibt Singleton-Instanz des File Processors zurück."""
|
||||
global _file_processor
|
||||
if _file_processor is None:
|
||||
_file_processor = FileProcessor()
|
||||
return _file_processor
|
||||
|
||||
|
||||
# Convenience functions
|
||||
def process_file(
|
||||
file_path: str = None,
|
||||
file_bytes: bytes = None,
|
||||
mode: ProcessingMode = ProcessingMode.MIXED
|
||||
) -> ProcessingResult:
|
||||
"""
|
||||
Convenience function zum Verarbeiten einer Datei.
|
||||
|
||||
Args:
|
||||
file_path: Pfad zur Datei
|
||||
file_bytes: Dateiinhalt als Bytes
|
||||
mode: Verarbeitungsmodus
|
||||
|
||||
Returns:
|
||||
ProcessingResult
|
||||
"""
|
||||
processor = get_file_processor()
|
||||
return processor.process(file_path, file_bytes, mode)
|
||||
|
||||
|
||||
def extract_text_from_pdf(file_path: str = None, file_bytes: bytes = None) -> str:
|
||||
"""Extrahiert Text aus einer PDF-Datei."""
|
||||
result = process_file(file_path, file_bytes, ProcessingMode.TEXT_EXTRACT)
|
||||
return result.text
|
||||
|
||||
|
||||
def ocr_image(file_path: str = None, file_bytes: bytes = None) -> str:
|
||||
"""Führt OCR auf einem Bild aus."""
|
||||
result = process_file(file_path, file_bytes, ProcessingMode.OCR_PRINTED)
|
||||
return result.text
|
||||
|
||||
|
||||
def ocr_handwriting(file_path: str = None, file_bytes: bytes = None) -> str:
|
||||
"""Führt Handschrift-OCR auf einem Bild aus."""
|
||||
result = process_file(file_path, file_bytes, ProcessingMode.OCR_HANDWRITING)
|
||||
return result.text
|
||||
Reference in New Issue
Block a user