This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/ai-content-generator/app/services/youtube_service.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

244 lines
7.2 KiB
Python

"""
YouTube Service
Video-Suche und Transkript-Analyse für Interactive Video Content
"""
import json
import logging
import os
import re
from typing import List, Dict, Any, Optional

from youtube_transcript_api import YouTubeTranscriptApi
class YouTubeService:
    """YouTube integration: video lookup, transcript retrieval and analysis.

    Transcript access goes through ``youtube_transcript_api`` and needs no
    API key. Video search is currently a placeholder (see ``search_videos``).
    """

    # Languages tried, in priority order, when the caller does not pass any.
    DEFAULT_LANGUAGES = ("de", "en")

    # Maximum number of transcript characters embedded in the Claude prompt
    # (keeps the prompt token-efficient).
    MAX_TRANSCRIPT_CHARS = 8000

    # URL shapes from which a video ID can be extracted. The ID capture stops
    # at the first query/fragment/path separator so trailing parameters never
    # leak into the returned ID.
    _VIDEO_ID_PATTERNS = (
        # watch URLs; ``v`` may appear anywhere in the query string
        re.compile(r"youtube\.com/watch\?(?:[^#]*&)?v=([^&#]+)"),
        # short links
        re.compile(r"youtu\.be/([^?&#/]+)"),
        # embed / shorts / live paths (also on the no-cookie domain)
        re.compile(
            r"youtube(?:-nocookie)?\.com/(?:embed|shorts|live)/([^?&#/]+)"
        ),
    )

    _logger = logging.getLogger(__name__)

    def __init__(self):
        # The transcript API works without a key; the key is only read so a
        # future YouTube Data API based search can use it (optional).
        self.youtube_api_key = os.getenv("YOUTUBE_API_KEY")

    def is_configured(self) -> bool:
        """Return whether the service is usable.

        Always ``True``: the transcript API does not require an API key.
        """
        return True

    async def search_videos(
        self,
        query: str,
        max_results: int = 5
    ) -> List[Dict[str, Any]]:
        """Search YouTube videos (placeholder implementation).

        No YouTube Data API integration exists yet, so a single dummy result
        is returned regardless of ``max_results``.

        Args:
            query: Search term; only echoed into the placeholder title.
            max_results: Currently unused.

        Returns:
            A list with exactly one placeholder video description.
        """
        # TODO: YouTube Data API integration (search.list). Until then this
        # returns a stub entry standing in for an educational channel result.
        return [
            {
                "video_id": "EXAMPLE_VIDEO_ID",
                "title": f"Video zum Thema: {query}",
                "channel": "Educational Channel",
                "url": "https://www.youtube.com/watch?v=EXAMPLE_VIDEO_ID",
                "has_transcript": False,
                "note": "Use real YouTube Data API in production"
            }
        ]

    async def get_video_transcript(
        self,
        video_id: str,
        languages: Optional[List[str]] = None
    ) -> Optional[Dict[str, Any]]:
        """Fetch the transcript of a video.

        Args:
            video_id: YouTube video ID.
            languages: Preferred language codes in priority order. ``None``
                means ``DEFAULT_LANGUAGES`` (German, then English).

        Returns:
            A dict with ``video_id``, ``language``, ``is_generated`` and
            ``transcript`` (the fetched entries), or ``None`` when no
            transcript could be retrieved.
        """
        # ``None`` default instead of a list literal: a mutable default would
        # be shared between calls.
        if languages is None:
            preferred = list(self.DEFAULT_LANGUAGES)
        else:
            preferred = list(languages)

        # NOTE(review): the library calls below appear to be synchronous; if
        # they perform blocking network I/O they stall the event loop --
        # consider running them in a worker thread.
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

            # find_transcript takes the codes in priority order, so a single
            # call covers the whole preference list.
            try:
                transcript = transcript_list.find_transcript(preferred)
            except Exception:
                transcript = None

            # No preferred language available: take the first transcript of
            # any kind (manually created or auto-generated) via the public
            # iterator rather than the private manual-only mapping.
            if transcript is None:
                transcript = next(iter(transcript_list), None)
            if transcript is None:
                self._logger.warning(
                    "No transcript available for %s", video_id
                )
                return None

            fetched = transcript.fetch()
            # Some library versions return an object exposing to_raw_data()
            # instead of a plain list of dicts; normalise so the other
            # methods of this class can index entries by key.
            if hasattr(fetched, "to_raw_data"):
                fetched = fetched.to_raw_data()

            return {
                "video_id": video_id,
                "language": transcript.language_code,
                "is_generated": transcript.is_generated,
                "transcript": fetched
            }
        except Exception as exc:
            # Best effort: any failure is reported as "no transcript".
            self._logger.warning(
                "Error fetching transcript for %s: %s", video_id, exc
            )
            return None

    def extract_key_moments(
        self,
        transcript_data: List[Dict[str, Any]],
        num_moments: int = 5
    ) -> List[Dict[str, Any]]:
        """Pick evenly spaced moments from a transcript.

        Simple heuristic: the total duration is split into
        ``num_moments + 1`` equal intervals and, for each inner boundary, the
        transcript entry whose start time is closest is selected. Short
        transcripts can therefore yield the same entry more than once.

        Args:
            transcript_data: Entries with ``start``, ``duration`` and ``text``.
            num_moments: Number of moments to return.

        Returns:
            One dict per moment with ``time`` (mm:ss), ``seconds`` and
            ``text``; empty when there is no transcript or ``num_moments``
            is not positive.
        """
        # Guarding num_moments also avoids a division by zero for -1.
        if not transcript_data or num_moments <= 0:
            return []

        last_entry = transcript_data[-1]
        total_duration = last_entry['start'] + last_entry['duration']
        interval = total_duration / (num_moments + 1)

        key_moments = []
        for step in range(1, num_moments + 1):
            target_time = interval * step
            closest_entry = min(
                transcript_data,
                key=lambda entry: abs(entry['start'] - target_time)
            )
            key_moments.append({
                "time": self._format_timestamp(closest_entry['start']),
                "seconds": closest_entry['start'],
                "text": closest_entry['text']
            })
        return key_moments

    async def generate_video_interactions_with_claude(
        self,
        video_id: str,
        topic: str,
        transcript_data: List[Dict[str, Any]],
        claude_service: Any,
        num_interactions: int = 5
    ) -> List[Dict[str, Any]]:
        """Generate interactive elements for a video using Claude.

        Args:
            video_id: YouTube video ID (not used in the prompt).
            topic: Video topic, embedded in the prompt.
            transcript_data: Full transcript entries.
            claude_service: Object exposing an awaitable
                ``generate_content(prompt=..., system_prompt=...)`` that
                returns the model's text response.
            num_interactions: Number of interactions to request.

        Returns:
            The interactions parsed from the response, each extended with a
            ``time`` key (mm:ss). Empty when the response contains no valid
            JSON array; entries without a usable ``seconds`` value are
            skipped.
        """
        full_text = self._create_transcript_text(transcript_data)
        # Truncate outside the f-string so no code comment ends up in the
        # text that is sent to the model.
        excerpt = full_text[:self.MAX_TRANSCRIPT_CHARS]

        prompt = f"""Analysiere dieses Video-Transkript zum Thema "{topic}" und identifiziere {num_interactions} wichtige Momente für interaktive Elemente.
Transkript:
{excerpt}
Für jeden Moment, erstelle:
1. Einen Zeitstempel (in Sekunden)
2. Einen Interaktionstyp (question, info, oder link)
3. Einen Titel
4. Den Inhalt (Frage, Information, oder URL)
Formatiere als JSON-Array:
[
{{
"seconds": 45,
"type": "question",
"title": "Verständnisfrage",
"content": "Was ist die Hauptfunktion...?"
}},
{{
"seconds": 120,
"type": "info",
"title": "Wichtiger Hinweis",
"content": "Beachte dass..."
}}
]
Wähle Momente die:
- Wichtige Konzepte einführen
- Verständnis testen
- Zusatzinformationen bieten
Nur JSON zurückgeben."""

        response = await claude_service.generate_content(
            prompt=prompt,
            system_prompt="Du bist ein Experte für interaktive Video-Didaktik."
        )

        # The model may wrap the array in extra text; take everything from
        # the first '[' to the last ']'.
        json_match = re.search(r'\[.*\]', response, re.DOTALL)
        if not json_match:
            return []

        try:
            parsed = json.loads(json_match.group())
        except json.JSONDecodeError as exc:
            self._logger.warning(
                "Could not parse interactions for %s: %s", video_id, exc
            )
            return []

        interactions = []
        for item in parsed:
            if not isinstance(item, dict):
                continue
            try:
                # Add the mm:ss representation next to the raw seconds.
                item['time'] = self._format_timestamp(float(item['seconds']))
            except (KeyError, TypeError, ValueError, OverflowError):
                self._logger.warning(
                    "Skipping interaction without usable 'seconds': %r", item
                )
                continue
            interactions.append(item)
        return interactions

    def _create_transcript_text(self, transcript_data: List[Dict[str, Any]]) -> str:
        """Render transcript entries as ``[mm:ss] text`` lines."""
        return "\n".join(
            f"[{self._format_timestamp(entry['start'])}] {entry['text']}"
            for entry in transcript_data
        )

    def _format_timestamp(self, seconds: float) -> str:
        """Format seconds as ``mm:ss`` (minutes are not wrapped at 60)."""
        minutes = int(seconds // 60)
        secs = int(seconds % 60)
        return f"{minutes:02d}:{secs:02d}"

    def extract_video_id_from_url(self, url: str) -> Optional[str]:
        """Extract the video ID from a YouTube URL.

        Handles ``watch?v=`` URLs (with ``v`` at any position in the query),
        ``youtu.be`` short links and ``/embed/``, ``/shorts/`` and ``/live/``
        paths.

        Args:
            url: The URL to inspect.

        Returns:
            The video ID, or ``None`` when ``url`` is empty or matches no
            known pattern.
        """
        if not url:
            return None
        for pattern in self._VIDEO_ID_PATTERNS:
            match = pattern.search(url)
            if match:
                return match.group(1)
        return None