breakpilot-pwa/ai-content-generator/app/services/material_analyzer.py

"""
Material Analyzer
Analysiert hochgeladene Lernmaterialien (PDF, Images, DOCX)
"""

from typing import Dict, Any, Optional
import io
from PyPDF2 import PdfReader
from PIL import Image
import pytesseract
from docx import Document
import mammoth


class MaterialAnalyzer:
    """Analyzer für verschiedene Material-Typen"""

    async def analyze(self, filename: str, content: bytes) -> Dict[str, Any]:
        """
        Analyze uploaded material

        Args:
            filename: Name der Datei
            content: Datei-Content als bytes

        Returns:
            Strukturierte Material-Daten
        """
        file_ext = filename.lower().split('.')[-1]

        try:
            if file_ext == 'pdf':
                return await self._analyze_pdf(filename, content)
            elif file_ext in ['png', 'jpg', 'jpeg']:
                return await self._analyze_image(filename, content)
            elif file_ext == 'docx':
                return await self._analyze_docx(filename, content)
            elif file_ext == 'txt':
                return await self._analyze_text(filename, content)
            else:
                return {
                    "filename": filename,
                    "type": "unknown",
                    "content": "",
                    "error": f"Unsupported file type: {file_ext}"
                }

        except Exception as e:
            return {
                "filename": filename,
                "type": "error",
                "content": "",
                "error": str(e)
            }

    async def _analyze_pdf(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from PDF"""
        try:
            pdf_file = io.BytesIO(content)
            reader = PdfReader(pdf_file)

            text_content = []
            num_pages = len(reader.pages)

            for page_num, page in enumerate(reader.pages, 1):
                text = page.extract_text()
                if text.strip():
                    text_content.append(f"--- Seite {page_num} ---")
                    text_content.append(text)

            return {
                "filename": filename,
                "type": "pdf",
                "num_pages": num_pages,
                "content": "\n".join(text_content),
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "pdf",
                "content": "",
                "error": f"PDF extraction failed: {str(e)}"
            }

    async def _analyze_image(self, filename: str, content: bytes) -> Dict[str, Any]:
        """
        Analyze image - OCR for text extraction
        Note: Requires tesseract installed
        """
        try:
            image = Image.open(io.BytesIO(content))

            # Image metadata
            width, height = image.size
            mode = image.mode

            # OCR text extraction (if tesseract available)
            ocr_text = ""
            try:
                ocr_text = pytesseract.image_to_string(image, lang='deu')
            except Exception as ocr_error:
                ocr_text = f"[OCR not available: {str(ocr_error)}]"

            return {
                "filename": filename,
                "type": "image",
                "width": width,
                "height": height,
                "mode": mode,
                "content": ocr_text,
                "note": "Image als Diagramm/Skizze erkannt. OCR Text extrahiert.",
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "image",
                "content": "",
                "error": f"Image analysis failed: {str(e)}"
            }

    async def _analyze_docx(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from DOCX"""
        try:
            # Methode 1: python-docx
            try:
                doc = Document(io.BytesIO(content))
                paragraphs = []
                for para in doc.paragraphs:
                    if para.text.strip():
                        paragraphs.append(para.text)

                text_content = "\n".join(paragraphs)

            except:
                # Methode 2: mammoth (bessere Formatierung)
                result = mammoth.convert_to_text(io.BytesIO(content))
                text_content = result.value

            return {
                "filename": filename,
                "type": "docx",
                "content": text_content,
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "docx",
                "content": "",
                "error": f"DOCX extraction failed: {str(e)}"
            }

    async def _analyze_text(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from plain text file"""
        try:
            text = content.decode('utf-8')

            return {
                "filename": filename,
                "type": "text",
                "content": text,
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "text",
                "content": "",
                "error": f"Text extraction failed: {str(e)}"
            }

    def extract_key_concepts(self, materials: list[Dict[str, Any]]) -> list[str]:
        """
        Extract key concepts from materials
        Simple heuristic: Find capitalized words, frequent terms

        In production: Use Claude AI for better concept extraction
        """
        all_text = " ".join([m.get("content", "") for m in materials])

        # Simple extraction: Capitalized words (potential concepts)
        import re
        words = re.findall(r'\b[A-ZÄÖÜ][a-zäöüß]+\b', all_text)

        # Count frequency
        from collections import Counter
        word_counts = Counter(words)

        # Return top 20 concepts
        concepts = [word for word, count in word_counts.most_common(20)]
        return concepts