This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
BreakPilot Dev 19855efacc
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
feat: BreakPilot PWA - Full codebase (clean push without large binaries)
All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
2026-02-11 13:25:58 +01:00

198 lines
6.1 KiB
Python

"""
Material Analyzer
Analysiert hochgeladene Lernmaterialien (PDF, Images, DOCX)
"""
from typing import Dict, Any, Optional
import io
from PyPDF2 import PdfReader
from PIL import Image
import pytesseract
from docx import Document
import mammoth
class MaterialAnalyzer:
    """Analyzer for the different supported material types.

    Every ``_analyze_*`` method returns a dict containing at least
    ``filename``, ``type`` and ``content``. Successful analyses add
    ``success: True``; failures add ``success: False`` and an ``error``
    message instead of raising, so one bad upload never breaks a batch.
    """

    async def analyze(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Analyze an uploaded material file.

        Args:
            filename: Name of the file; the extension selects the parser.
            content: Raw file content as bytes.

        Returns:
            Structured material data (see class docstring for the schema).
        """
        # Extension after the last dot; a name without any dot yields the
        # whole (lowercased) name and falls through to the "unknown" branch.
        file_ext = filename.lower().rsplit('.', 1)[-1]
        try:
            if file_ext == 'pdf':
                return await self._analyze_pdf(filename, content)
            elif file_ext in ('png', 'jpg', 'jpeg'):
                return await self._analyze_image(filename, content)
            elif file_ext == 'docx':
                return await self._analyze_docx(filename, content)
            elif file_ext == 'txt':
                return await self._analyze_text(filename, content)
            return {
                "filename": filename,
                "type": "unknown",
                "content": "",
                "success": False,
                "error": f"Unsupported file type: {file_ext}"
            }
        except Exception as e:
            # Last-resort guard: the per-type handlers already catch their
            # own errors, so this only fires on truly unexpected failures.
            return {
                "filename": filename,
                "type": "error",
                "content": "",
                "success": False,
                "error": str(e)
            }

    async def _analyze_pdf(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from a PDF, page by page."""
        try:
            reader = PdfReader(io.BytesIO(content))
            text_content = []
            for page_num, page in enumerate(reader.pages, 1):
                text = page.extract_text()
                # extract_text() may return None for image-only pages;
                # guard before .strip() to avoid an AttributeError.
                if text and text.strip():
                    text_content.append(f"--- Seite {page_num} ---")
                    text_content.append(text)
            return {
                "filename": filename,
                "type": "pdf",
                "num_pages": len(reader.pages),
                "content": "\n".join(text_content),
                "success": True
            }
        except Exception as e:
            return {
                "filename": filename,
                "type": "pdf",
                "content": "",
                "success": False,
                "error": f"PDF extraction failed: {str(e)}"
            }

    async def _analyze_image(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Analyze an image: record metadata and OCR any embedded text.

        Note: OCR requires a local tesseract installation; when it is
        missing, the failure is reported inside ``content`` rather than
        failing the whole analysis.
        """
        try:
            image = Image.open(io.BytesIO(content))
            width, height = image.size
            mode = image.mode
            # Best-effort OCR (German language pack) — degrade gracefully
            # instead of failing the image analysis outright.
            try:
                ocr_text = pytesseract.image_to_string(image, lang='deu')
            except Exception as ocr_error:
                ocr_text = f"[OCR not available: {str(ocr_error)}]"
            return {
                "filename": filename,
                "type": "image",
                "width": width,
                "height": height,
                "mode": mode,
                "content": ocr_text,
                "note": "Image als Diagramm/Skizze erkannt. OCR Text extrahiert.",
                "success": True
            }
        except Exception as e:
            return {
                "filename": filename,
                "type": "image",
                "content": "",
                "success": False,
                "error": f"Image analysis failed: {str(e)}"
            }

    async def _analyze_docx(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from a DOCX file.

        Tries python-docx first; falls back to mammoth when python-docx
        cannot parse the document.
        """
        try:
            try:
                # Method 1: python-docx, paragraph by paragraph.
                doc = Document(io.BytesIO(content))
                text_content = "\n".join(
                    para.text for para in doc.paragraphs if para.text.strip()
                )
            except Exception:
                # Method 2: mammoth. Bug fix: the Python mammoth API is
                # extract_raw_text; convert_to_text does not exist and the
                # original fallback always raised AttributeError.
                result = mammoth.extract_raw_text(io.BytesIO(content))
                text_content = result.value
            return {
                "filename": filename,
                "type": "docx",
                "content": text_content,
                "success": True
            }
        except Exception as e:
            return {
                "filename": filename,
                "type": "docx",
                "content": "",
                "success": False,
                "error": f"DOCX extraction failed: {str(e)}"
            }

    async def _analyze_text(self, filename: str, content: bytes) -> Dict[str, Any]:
        """Extract text from a plain-text file (UTF-8, Latin-1 fallback)."""
        try:
            try:
                text = content.decode('utf-8')
            except UnicodeDecodeError:
                # Latin-1 maps every byte, so legacy-encoded uploads still
                # yield usable text instead of a hard failure.
                text = content.decode('latin-1')
            return {
                "filename": filename,
                "type": "text",
                "content": text,
                "success": True
            }
        except Exception as e:
            return {
                "filename": filename,
                "type": "text",
                "content": "",
                "success": False,
                "error": f"Text extraction failed: {str(e)}"
            }

    def extract_key_concepts(self, materials: list[Dict[str, Any]]) -> list[str]:
        """Extract up to 20 key concepts from analyzed materials.

        Simple heuristic: words starting with a capital (incl. German
        umlauts) are concept candidates, ranked by frequency. In
        production this should be replaced by AI-based extraction.
        """
        import re
        from collections import Counter

        all_text = " ".join(m.get("content", "") for m in materials)
        # Capitalized-word pattern covering German umlauts and eszett.
        words = re.findall(r'\b[A-ZÄÖÜ][a-zäöüß]+\b', all_text)
        return [word for word, _count in Counter(words).most_common(20)]