fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) recovered only some of them.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
ai-content-generator/app/services/material_analyzer.py (new file, 197 lines)
@@ -0,0 +1,197 @@
"""
Material Analyzer

Analyzes uploaded learning materials (PDF, images, DOCX).
"""

import io
import re
from collections import Counter
from typing import Dict, Any

from PyPDF2 import PdfReader
from PIL import Image
import pytesseract
from docx import Document
import mammoth


class MaterialAnalyzer:
    """Analyzer for the different material types."""

    async def analyze(self, filename: str, content: bytes) -> Dict[str, Any]:
        """
        Analyze uploaded material.

        Args:
            filename: name of the file
            content: file content as bytes

        Returns:
            Structured material data
        """
        file_ext = filename.lower().split('.')[-1]

        try:
            if file_ext == 'pdf':
                return await self._analyze_pdf(filename, content)
            elif file_ext in ['png', 'jpg', 'jpeg']:
                return await self._analyze_image(filename, content)
            elif file_ext == 'docx':
                return await self._analyze_docx(filename, content)
            elif file_ext == 'txt':
                return await self._analyze_text(filename, content)
            else:
                return {
                    "filename": filename,
                    "type": "unknown",
                    "content": "",
                    "error": f"Unsupported file type: {file_ext}"
                }

        except Exception as e:
            return {
                "filename": filename,
                "type": "error",
                "content": "",
                "error": str(e)
            }

    async def _analyze_pdf(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from PDF."""
        try:
            pdf_file = io.BytesIO(content)
            reader = PdfReader(pdf_file)

            text_content = []
            num_pages = len(reader.pages)

            for page_num, page in enumerate(reader.pages, 1):
                # extract_text() can yield an empty string (or None in
                # older PyPDF2 releases) for image-only pages.
                text = page.extract_text()
                if text and text.strip():
                    text_content.append(f"--- Seite {page_num} ---")
                    text_content.append(text)

            return {
                "filename": filename,
                "type": "pdf",
                "num_pages": num_pages,
                "content": "\n".join(text_content),
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "pdf",
                "content": "",
                "error": f"PDF extraction failed: {str(e)}"
            }

    async def _analyze_image(self, filename: str, content: bytes) -> Dict[str, Any]:
        """
        Analyze image - OCR for text extraction.

        Note: requires the tesseract binary to be installed.
        """
        try:
            image = Image.open(io.BytesIO(content))

            # Image metadata
            width, height = image.size
            mode = image.mode

            # OCR text extraction (if tesseract is available)
            ocr_text = ""
            try:
                ocr_text = pytesseract.image_to_string(image, lang='deu')
            except Exception as ocr_error:
                ocr_text = f"[OCR not available: {str(ocr_error)}]"

            return {
                "filename": filename,
                "type": "image",
                "width": width,
                "height": height,
                "mode": mode,
                "content": ocr_text,
                "note": "Image als Diagramm/Skizze erkannt. OCR Text extrahiert.",
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "image",
                "content": "",
                "error": f"Image analysis failed: {str(e)}"
            }

    async def _analyze_docx(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from DOCX."""
        try:
            # Method 1: python-docx
            try:
                doc = Document(io.BytesIO(content))
                paragraphs = []
                for para in doc.paragraphs:
                    if para.text.strip():
                        paragraphs.append(para.text)

                text_content = "\n".join(paragraphs)

            except Exception:
                # Method 2: mammoth (better formatting). Note: mammoth has no
                # convert_to_text(); extract_raw_text() is its plain-text API.
                result = mammoth.extract_raw_text(io.BytesIO(content))
                text_content = result.value

            return {
                "filename": filename,
                "type": "docx",
                "content": text_content,
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "docx",
                "content": "",
                "error": f"DOCX extraction failed: {str(e)}"
            }

    async def _analyze_text(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from a plain text file."""
        try:
            text = content.decode('utf-8')

            return {
                "filename": filename,
                "type": "text",
                "content": text,
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "text",
                "content": "",
                "error": f"Text extraction failed: {str(e)}"
            }

    def extract_key_concepts(self, materials: list[Dict[str, Any]]) -> list[str]:
        """
        Extract key concepts from materials.

        Simple heuristic: find capitalized words and frequent terms.

        In production: use Claude AI for better concept extraction.
        """
        all_text = " ".join([m.get("content", "") for m in materials])

        # Simple extraction: capitalized words (potential concepts)
        words = re.findall(r'\b[A-ZÄÖÜ][a-zäöüß]+\b', all_text)

        # Count frequency
        word_counts = Counter(words)

        # Return the top 20 concepts
        return [word for word, _ in word_counts.most_common(20)]
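
For reviewers, a minimal usage sketch of the restored analyzer (not part of this commit; the import path matches the file location, but the sample file name and the way the bytes are obtained are made up for illustration):

import asyncio

from app.services.material_analyzer import MaterialAnalyzer

async def main() -> None:
    analyzer = MaterialAnalyzer()
    # Hypothetical local file; in the service the bytes come from an upload.
    with open("klausur.pdf", "rb") as f:
        result = await analyzer.analyze("klausur.pdf", f.read())
    print(result.get("type"), result.get("num_pages"), len(result.get("content", "")))

asyncio.run(main())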
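
The extract_key_concepts docstring defers to Claude for production-quality concept extraction. A hedged sketch of what that could look like with the anthropic SDK; the model id, prompt, text truncation, and response parsing are all assumptions, not code from this commit:

import anthropic

def extract_key_concepts_llm(all_text: str) -> list[str]:
    # anthropic.Anthropic() reads ANTHROPIC_API_KEY from the environment.
    client = anthropic.Anthropic()
    message = client.messages.create(
        model="claude-sonnet-4-20250514",  # placeholder model id
        max_tokens=500,
        messages=[{
            "role": "user",
            "content": "List the 20 key concepts in this learning material, "
                       "one per line:\n\n" + all_text[:8000],
        }],
    )
    # message.content is a list of content blocks; take the first text block.
    return [line.strip() for line in message.content[0].text.splitlines() if line.strip()]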