fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) recovered only some of them.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
ai-content-generator/app/services/material_analyzer.py (new file, 197 lines)
@@ -0,0 +1,197 @@
"""
Material Analyzer

Analyzes uploaded learning materials (PDF, images, DOCX).
"""

import io
import re
from collections import Counter
from typing import Dict, Any

from PyPDF2 import PdfReader
from PIL import Image
import pytesseract
from docx import Document
import mammoth


class MaterialAnalyzer:
    """Analyzer for the different material types."""

    async def analyze(self, filename: str, content: bytes) -> Dict[str, Any]:
        """
        Analyze uploaded material.

        Args:
            filename: name of the file
            content: file content as bytes

        Returns:
            Structured material data
        """
        file_ext = filename.lower().split('.')[-1]

        try:
            if file_ext == 'pdf':
                return await self._analyze_pdf(filename, content)
            elif file_ext in ['png', 'jpg', 'jpeg']:
                return await self._analyze_image(filename, content)
            elif file_ext == 'docx':
                return await self._analyze_docx(filename, content)
            elif file_ext == 'txt':
                return await self._analyze_text(filename, content)
            else:
                return {
                    "filename": filename,
                    "type": "unknown",
                    "content": "",
                    "error": f"Unsupported file type: {file_ext}"
                }

        except Exception as e:
            return {
                "filename": filename,
                "type": "error",
                "content": "",
                "error": str(e)
            }

    async def _analyze_pdf(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from PDF."""
        try:
            pdf_file = io.BytesIO(content)
            reader = PdfReader(pdf_file)

            text_content = []
            num_pages = len(reader.pages)

            for page_num, page in enumerate(reader.pages, 1):
                # extract_text() can yield an empty string (or None in
                # older PyPDF2 releases) for image-only pages.
                text = page.extract_text()
                if text and text.strip():
                    text_content.append(f"--- Seite {page_num} ---")
                    text_content.append(text)

            return {
                "filename": filename,
                "type": "pdf",
                "num_pages": num_pages,
                "content": "\n".join(text_content),
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "pdf",
                "content": "",
                "error": f"PDF extraction failed: {str(e)}"
            }

    async def _analyze_image(self, filename: str, content: bytes) -> Dict[str, Any]:
        """
        Analyze image - OCR for text extraction.

        Note: requires the tesseract binary to be installed.
        """
        try:
            image = Image.open(io.BytesIO(content))

            # Image metadata
            width, height = image.size
            mode = image.mode

            # OCR text extraction (if tesseract is available)
            ocr_text = ""
            try:
                ocr_text = pytesseract.image_to_string(image, lang='deu')
            except Exception as ocr_error:
                ocr_text = f"[OCR not available: {str(ocr_error)}]"

            return {
                "filename": filename,
                "type": "image",
                "width": width,
                "height": height,
                "mode": mode,
                "content": ocr_text,
                "note": "Image als Diagramm/Skizze erkannt. OCR Text extrahiert.",
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "image",
                "content": "",
                "error": f"Image analysis failed: {str(e)}"
            }

    async def _analyze_docx(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from DOCX."""
        try:
            # Method 1: python-docx
            try:
                doc = Document(io.BytesIO(content))
                paragraphs = []
                for para in doc.paragraphs:
                    if para.text.strip():
                        paragraphs.append(para.text)

                text_content = "\n".join(paragraphs)

            except Exception:
                # Method 2: mammoth (better formatting). Note: mammoth has no
                # convert_to_text(); extract_raw_text() is its plain-text API.
                result = mammoth.extract_raw_text(io.BytesIO(content))
                text_content = result.value

            return {
                "filename": filename,
                "type": "docx",
                "content": text_content,
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "docx",
                "content": "",
                "error": f"DOCX extraction failed: {str(e)}"
            }

    async def _analyze_text(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from a plain text file."""
        try:
            text = content.decode('utf-8')

            return {
                "filename": filename,
                "type": "text",
                "content": text,
                "success": True
            }

        except Exception as e:
            return {
                "filename": filename,
                "type": "text",
                "content": "",
                "error": f"Text extraction failed: {str(e)}"
            }

    def extract_key_concepts(self, materials: list[Dict[str, Any]]) -> list[str]:
        """
        Extract key concepts from materials.

        Simple heuristic: find capitalized words and frequent terms.

        In production: use Claude AI for better concept extraction.
        """
        all_text = " ".join([m.get("content", "") for m in materials])

        # Simple extraction: capitalized words (potential concepts)
        words = re.findall(r'\b[A-ZÄÖÜ][a-zäöüß]+\b', all_text)

        # Count frequency
        word_counts = Counter(words)

        # Return the top 20 concepts
        return [word for word, _ in word_counts.most_common(20)]
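
For reviewers, a minimal usage sketch of the restored analyzer (not part of this commit; the import path matches the file location, but the sample file name and the way the bytes are obtained are made up for illustration):

import asyncio

from app.services.material_analyzer import MaterialAnalyzer

async def main() -> None:
    analyzer = MaterialAnalyzer()
    # Hypothetical local file; in the service the bytes come from an upload.
    with open("klausur.pdf", "rb") as f:
        result = await analyzer.analyze("klausur.pdf", f.read())
    print(result.get("type"), result.get("num_pages"), len(result.get("content", "")))

asyncio.run(main())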
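
The extract_key_concepts docstring defers to Claude for production-quality concept extraction. A hedged sketch of what that could look like with the anthropic SDK; the model id, prompt, text truncation, and response parsing are all assumptions, not code from this commit:

import anthropic

def extract_key_concepts_llm(all_text: str) -> list[str]:
    # anthropic.Anthropic() reads ANTHROPIC_API_KEY from the environment.
    client = anthropic.Anthropic()
    message = client.messages.create(
        model="claude-sonnet-4-20250514",  # placeholder model id
        max_tokens=500,
        messages=[{
            "role": "user",
            "content": "List the 20 key concepts in this learning material, "
                       "one per line:\n\n" + all_text[:8000],
        }],
    )
    # message.content is a list of content blocks; take the first text block.
    return [line.strip() for line in message.content[0].text.splitlines() if line.strip()]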