""" Material Analyzer Analysiert hochgeladene Lernmaterialien (PDF, Images, DOCX) """ from typing import Dict, Any, Optional import io from PyPDF2 import PdfReader from PIL import Image import pytesseract from docx import Document import mammoth class MaterialAnalyzer: """Analyzer für verschiedene Material-Typen""" async def analyze(self, filename: str, content: bytes) -> Dict[str, Any]: """ Analyze uploaded material Args: filename: Name der Datei content: Datei-Content als bytes Returns: Strukturierte Material-Daten """ file_ext = filename.lower().split('.')[-1] try: if file_ext == 'pdf': return await self._analyze_pdf(filename, content) elif file_ext in ['png', 'jpg', 'jpeg']: return await self._analyze_image(filename, content) elif file_ext == 'docx': return await self._analyze_docx(filename, content) elif file_ext == 'txt': return await self._analyze_text(filename, content) else: return { "filename": filename, "type": "unknown", "content": "", "error": f"Unsupported file type: {file_ext}" } except Exception as e: return { "filename": filename, "type": "error", "content": "", "error": str(e) } async def _analyze_pdf(self, filename: str, content: bytes) -> Dict[str, Any]: """Extract text from PDF""" try: pdf_file = io.BytesIO(content) reader = PdfReader(pdf_file) text_content = [] num_pages = len(reader.pages) for page_num, page in enumerate(reader.pages, 1): text = page.extract_text() if text.strip(): text_content.append(f"--- Seite {page_num} ---") text_content.append(text) return { "filename": filename, "type": "pdf", "num_pages": num_pages, "content": "\n".join(text_content), "success": True } except Exception as e: return { "filename": filename, "type": "pdf", "content": "", "error": f"PDF extraction failed: {str(e)}" } async def _analyze_image(self, filename: str, content: bytes) -> Dict[str, Any]: """ Analyze image - OCR for text extraction Note: Requires tesseract installed """ try: image = Image.open(io.BytesIO(content)) # Image metadata width, height = image.size mode = image.mode # OCR text extraction (if tesseract available) ocr_text = "" try: ocr_text = pytesseract.image_to_string(image, lang='deu') except Exception as ocr_error: ocr_text = f"[OCR not available: {str(ocr_error)}]" return { "filename": filename, "type": "image", "width": width, "height": height, "mode": mode, "content": ocr_text, "note": "Image als Diagramm/Skizze erkannt. OCR Text extrahiert.", "success": True } except Exception as e: return { "filename": filename, "type": "image", "content": "", "error": f"Image analysis failed: {str(e)}" } async def _analyze_docx(self, filename: str, content: bytes) -> Dict[str, Any]: """Extract text from DOCX""" try: # Methode 1: python-docx try: doc = Document(io.BytesIO(content)) paragraphs = [] for para in doc.paragraphs: if para.text.strip(): paragraphs.append(para.text) text_content = "\n".join(paragraphs) except: # Methode 2: mammoth (bessere Formatierung) result = mammoth.convert_to_text(io.BytesIO(content)) text_content = result.value return { "filename": filename, "type": "docx", "content": text_content, "success": True } except Exception as e: return { "filename": filename, "type": "docx", "content": "", "error": f"DOCX extraction failed: {str(e)}" } async def _analyze_text(self, filename: str, content: bytes) -> Dict[str, Any]: """Extract text from plain text file""" try: text = content.decode('utf-8') return { "filename": filename, "type": "text", "content": text, "success": True } except Exception as e: return { "filename": filename, "type": "text", "content": "", "error": f"Text extraction failed: {str(e)}" } def extract_key_concepts(self, materials: list[Dict[str, Any]]) -> list[str]: """ Extract key concepts from materials Simple heuristic: Find capitalized words, frequent terms In production: Use Claude AI for better concept extraction """ all_text = " ".join([m.get("content", "") for m in materials]) # Simple extraction: Capitalized words (potential concepts) import re words = re.findall(r'\b[A-ZÄÖÜ][a-zäöüß]+\b', all_text) # Count frequency from collections import Counter word_counts = Counter(words) # Return top 20 concepts concepts = [word for word, count in word_counts.most_common(20)] return concepts