# NOTE: This file was restored from pre-rebase ref 98933f5e after a
# `git pull --rebase origin main` dropped 177 local commits (3400+ files
# across admin-v2, backend, studio-v2, website, klausur-service, and many
# other services). The partial restore attempt (660295e2) only recovered
# some files. Post-rebase additions (night-scheduler, night-mode UI,
# NightModeWidget dashboard integration) were preserved.
#
# Restored features include:
# - AI Module Sidebar (FAB), OCR Labeling, OCR Compare
# - GPU Dashboard, RAG Pipeline, Magic Help
# - Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
# - Companion, Zeugnisse-Crawler, Screen Flow
# - Full backend, studio-v2, website, klausur-service
# - All compliance SDKs, agent-core, voice-service
# - CI/CD configs, documentation, scripts
"""
|
|
Material Analyzer
|
|
Analysiert hochgeladene Lernmaterialien (PDF, Images, DOCX)
|
|
"""
|
|
|
|
import io
from typing import Any, Dict, Optional

import mammoth
import pytesseract
from docx import Document
from PIL import Image
from PyPDF2 import PdfReader


class MaterialAnalyzer:
    """Analyzer for the different supported material types.

    Dispatches on file extension to a type-specific extractor. Every
    extractor returns a dict containing at least ``filename``, ``type``
    and ``content``; successful runs add ``success: True``, failures add
    an ``error`` message instead of raising.
    """

    async def analyze(self, filename: str, content: bytes) -> Dict[str, Any]:
        """
        Analyze uploaded material.

        Args:
            filename: Name of the file; its extension selects the handler.
            content: File content as bytes.

        Returns:
            Structured material data. Never raises: unsupported types and
            extraction failures are reported via the ``error`` key.
        """
        file_ext = filename.lower().split('.')[-1]

        try:
            if file_ext == 'pdf':
                return await self._analyze_pdf(filename, content)
            elif file_ext in ('png', 'jpg', 'jpeg'):
                return await self._analyze_image(filename, content)
            elif file_ext == 'docx':
                return await self._analyze_docx(filename, content)
            elif file_ext == 'txt':
                return await self._analyze_text(filename, content)
            else:
                return {
                    "filename": filename,
                    "type": "unknown",
                    "content": "",
                    "error": f"Unsupported file type: {file_ext}"
                }

        except Exception as e:
            # Defensive catch-all: one corrupt file must not crash the caller.
            return {
                "filename": filename,
                "type": "error",
                "content": "",
                "error": str(e)
            }

    async def _analyze_pdf(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from a PDF, page by page, with page markers."""
        try:
            pdf_file = io.BytesIO(content)
            reader = PdfReader(pdf_file)

            text_content = []
            num_pages = len(reader.pages)

            for page_num, page in enumerate(reader.pages, 1):
                text = page.extract_text()
                # Skip blank pages so the joined output stays compact.
                if text.strip():
                    text_content.append(f"--- Seite {page_num} ---")
                    text_content.append(text)

            return {
                "filename": filename,
                "type": "pdf",
                "num_pages": num_pages,
                "content": "\n".join(text_content),
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "pdf",
                "content": "",
                "error": f"PDF extraction failed: {str(e)}"
            }

    async def _analyze_image(self, filename: str, content: bytes) -> Dict[str, Any]:
        """
        Analyze an image: record metadata and OCR-extract contained text.

        Note: OCR requires a local tesseract installation; when it is
        missing, a placeholder message is returned as the content.
        """
        try:
            image = Image.open(io.BytesIO(content))

            # Image metadata
            width, height = image.size
            mode = image.mode

            # OCR text extraction (only if tesseract is available)
            ocr_text = ""
            try:
                ocr_text = pytesseract.image_to_string(image, lang='deu')
            except Exception as ocr_error:
                # Degrade gracefully: metadata is still useful without OCR.
                ocr_text = f"[OCR not available: {str(ocr_error)}]"

            return {
                "filename": filename,
                "type": "image",
                "width": width,
                "height": height,
                "mode": mode,
                "content": ocr_text,
                "note": "Image als Diagramm/Skizze erkannt. OCR Text extrahiert.",
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "image",
                "content": "",
                "error": f"Image analysis failed: {str(e)}"
            }

    async def _analyze_docx(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from DOCX via python-docx, falling back to mammoth."""
        try:
            # Method 1: python-docx
            try:
                doc = Document(io.BytesIO(content))
                paragraphs = []
                for para in doc.paragraphs:
                    if para.text.strip():
                        paragraphs.append(para.text)

                text_content = "\n".join(paragraphs)

            # FIX: was a bare `except:` that also swallowed SystemExit and
            # KeyboardInterrupt; narrow to Exception.
            except Exception:
                # Method 2: mammoth fallback.
                # FIX: the Python mammoth API is extract_raw_text();
                # convert_to_text() only exists in mammoth.js, so this
                # fallback path previously always raised AttributeError.
                result = mammoth.extract_raw_text(io.BytesIO(content))
                text_content = result.value

            return {
                "filename": filename,
                "type": "docx",
                "content": text_content,
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "docx",
                "content": "",
                "error": f"DOCX extraction failed: {str(e)}"
            }

    async def _analyze_text(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Decode a plain-text file (UTF-8, with a Latin-1 fallback)."""
        try:
            try:
                text = content.decode('utf-8')
            except UnicodeDecodeError:
                # Legacy exports are frequently Latin-1/Windows-1252;
                # fall back instead of failing the whole upload.
                text = content.decode('latin-1')

            return {
                "filename": filename,
                "type": "text",
                "content": text,
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "text",
                "content": "",
                "error": f"Text extraction failed: {str(e)}"
            }

    def extract_key_concepts(self, materials: list[Dict[str, Any]]) -> list[str]:
        """
        Extract key concepts from analyzed materials.

        Simple heuristic: capitalized words (including German umlauts)
        ranked by frequency. In production, use Claude AI for better
        concept extraction.

        Args:
            materials: Analyzed material dicts; only ``content`` is read.

        Returns:
            Up to 20 candidate concepts, most frequent first.
        """
        import re
        from collections import Counter

        all_text = " ".join([m.get("content", "") for m in materials])

        # Capitalized words are treated as potential concepts.
        words = re.findall(r'\b[A-ZÄÖÜ][a-zäöüß]+\b', all_text)

        word_counts = Counter(words)

        # Return the 20 most frequent concepts.
        return [word for word, count in word_counts.most_common(20)]