fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
8
backend/alerts_agent/ingestion/__init__.py
Normal file
8
backend/alerts_agent/ingestion/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
"""Alert Ingestion Modules."""
|
||||
|
||||
from .rss_fetcher import RSSFetcher, FeedConfig
|
||||
|
||||
__all__ = [
|
||||
"RSSFetcher",
|
||||
"FeedConfig",
|
||||
]
|
||||
356
backend/alerts_agent/ingestion/email_parser.py
Normal file
356
backend/alerts_agent/ingestion/email_parser.py
Normal file
@@ -0,0 +1,356 @@
|
||||
"""
|
||||
Email Parser für Google Alerts.
|
||||
|
||||
Parst Google Alert E-Mails und extrahiert Alert-Items.
|
||||
|
||||
Google Alert E-Mail Format:
|
||||
- Subject: Google Alert - <Suchbegriff>
|
||||
- Body enthält HTML mit Links zu Artikeln
|
||||
- Jeder Artikel hat: Titel, URL, Snippet, Quelle
|
||||
"""
|
||||
import re
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Dict, Any
|
||||
from html import unescape
|
||||
from urllib.parse import urlparse, parse_qs, unquote
|
||||
from email import message_from_bytes, message_from_string
|
||||
from email.message import EmailMessage
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ParsedAlertEmail:
    """Result of parsing a single Google Alert email."""
    # Search term taken from the "Google Alert - <term>" subject line.
    search_term: str
    # Parsed alert entries; each dict carries title, url, snippet, source.
    items: List[Dict[str, Any]]
    # When the mail was received (falls back to processing time).
    received_at: datetime
    # RFC 5322 Message-ID header, if present.
    message_id: Optional[str] = None
|
||||
|
||||
|
||||
def extract_real_url(google_redirect_url: str) -> str:
    """Resolve a Google Alert redirect link to its destination URL.

    Google Alert links have the form:
        https://www.google.com/url?rct=j&sa=t&url=<ENCODED_URL>&...

    Args:
        google_redirect_url: A (possibly) Google redirect URL.

    Returns:
        The decoded target URL, or the input unchanged when it is not
        a Google redirect link or carries no ``url`` parameter.
    """
    # Anything that is not a redirect link passes through untouched.
    if "google.com/url" not in google_redirect_url:
        return google_redirect_url

    query = parse_qs(urlparse(google_redirect_url).query)
    target = query.get("url")
    if target:
        return unquote(target[0])
    return google_redirect_url
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
    """Decode HTML entities and collapse runs of whitespace in *text*."""
    if not text:
        return ""

    # Entities first (&amp; -> &), then normalize all whitespace to
    # single spaces and trim the ends.
    decoded = unescape(text)
    collapsed = re.sub(r"\s+", " ", decoded)
    return collapsed.strip()
|
||||
|
||||
|
||||
def parse_google_alert_html(html_content: str) -> List[Dict[str, Any]]:
    """
    Parse the HTML body of a Google Alert email.

    Args:
        html_content: HTML content of the email.

    Returns:
        List of alert items, each a dict with title, url, snippet, source.
        Returns whatever was collected before an error; never raises.
    """
    items = []

    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Google Alerts uses different layouts over time.
        # Format 1: table-based (older format).
        for table in soup.find_all('table'):
            # Look for links inside the table.
            for link in table.find_all('a', href=True):
                href = link.get('href', '')

                # Only Google redirect links are actual alert entries.
                if 'google.com/url' not in href:
                    continue

                real_url = extract_real_url(href)

                # The link text is the article title; very short texts are
                # navigation chrome, not titles.
                title = clean_text(link.get_text())
                if not title or len(title) < 5:
                    continue

                # Snippet: text following the link within the same cell.
                parent = link.find_parent('td') or link.find_parent('div')
                snippet = ""
                if parent:
                    # Extract the cell text and strip the title out of it.
                    full_text = clean_text(parent.get_text())
                    if title in full_text:
                        snippet = full_text.replace(title, '').strip()
                    # Keep the first 300 characters as the snippet.
                    snippet = snippet[:300]

                # Source = domain of the target article.
                source_domain = urlparse(real_url).netloc

                items.append({
                    "title": title,
                    "url": real_url,
                    "snippet": snippet,
                    "source": source_domain,
                })

        # Format 2: div-based (newer format) — only tried when the
        # table pass found nothing.
        if not items:
            for div in soup.find_all('div', class_=re.compile(r'.*')):
                for link in div.find_all('a', href=True):
                    href = link.get('href', '')

                    if 'google.com/url' not in href:
                        continue

                    real_url = extract_real_url(href)
                    title = clean_text(link.get_text())

                    if not title or len(title) < 5:
                        continue

                    # Avoid duplicates (nested divs yield the same link twice).
                    if any(i['url'] == real_url for i in items):
                        continue

                    source_domain = urlparse(real_url).netloc

                    items.append({
                        "title": title,
                        "url": real_url,
                        "snippet": "",
                        "source": source_domain,
                    })

    except Exception as e:
        logger.error(f"Error parsing Google Alert HTML: {e}")

    return items
|
||||
|
||||
|
||||
def parse_email_message(
    email_bytes: Optional[bytes] = None,
    email_string: Optional[str] = None,
) -> Optional[ParsedAlertEmail]:
    """
    Parse an email message in the Google Alert format.

    Args:
        email_bytes: Raw email as bytes (takes precedence).
        email_string: Email as a string.

    Returns:
        ParsedAlertEmail, or None when no input is given, the mail is not
        a Google Alert, it has no HTML body, or parsing fails.
    """
    from email.utils import parsedate_to_datetime

    try:
        if email_bytes:
            msg = message_from_bytes(email_bytes)
        elif email_string:
            msg = message_from_string(email_string)
        else:
            return None

        # Only Google Alert emails are handled.
        subject = msg.get('Subject', '')
        if 'Google Alert' not in subject:
            logger.debug(f"Not a Google Alert email: {subject}")
            return None

        # Subject format: "Google Alert - <search term>"
        search_term = ""
        if ' - ' in subject:
            search_term = subject.split(' - ', 1)[1].strip()

        # Message-ID
        message_id = msg.get('Message-ID', '')

        # BUG FIX: the Date header was read but never parsed, so
        # received_at was always the processing time. Parse it properly
        # and fall back to "now" only when missing or malformed.
        date_str = msg.get('Date', '')
        received_at = datetime.utcnow()  # fallback
        if date_str:
            try:
                parsed_date = parsedate_to_datetime(date_str)
                if parsed_date is not None:
                    received_at = parsed_date
            except (TypeError, ValueError):
                pass

        # Extract the HTML body (Google Alerts are HTML emails).
        html_content = ""

        if msg.is_multipart():
            for part in msg.walk():
                content_type = part.get_content_type()
                if content_type == 'text/html':
                    payload = part.get_payload(decode=True)
                    if payload:
                        charset = part.get_content_charset() or 'utf-8'
                        html_content = payload.decode(charset, errors='replace')
                        break
        else:
            content_type = msg.get_content_type()
            if content_type == 'text/html':
                payload = msg.get_payload(decode=True)
                if payload:
                    charset = msg.get_content_charset() or 'utf-8'
                    html_content = payload.decode(charset, errors='replace')

        if not html_content:
            logger.warning(f"No HTML content in Google Alert email: {subject}")
            return None

        # Parse the HTML into alert items.
        items = parse_google_alert_html(html_content)

        return ParsedAlertEmail(
            search_term=search_term,
            items=items,
            received_at=received_at,
            message_id=message_id,
        )

    except Exception as e:
        logger.error(f"Error parsing email message: {e}")
        return None
|
||||
|
||||
|
||||
async def process_alert_emails(
    emails: List[bytes],
    topic_id: str,
    db,
) -> Dict[str, int]:
    """
    Process a batch of Google Alert emails and persist their items.

    Args:
        emails: Raw emails as bytes.
        topic_id: ID of the owning topic.
        db: SQLAlchemy session.

    Returns:
        Dict with ``new_items`` and ``duplicates_skipped`` counts.
    """
    from alerts_agent.db.repository import AlertItemRepository
    from alerts_agent.db.models import AlertSourceEnum

    item_repo = AlertItemRepository(db)

    created = 0
    skipped = 0

    for raw_email in emails:
        parsed = parse_email_message(email_bytes=raw_email)
        if not parsed:
            # Not a Google Alert mail or unparseable — ignore it.
            continue

        for entry in parsed.items:
            # create_if_not_exists returns None for already-known URLs.
            stored = item_repo.create_if_not_exists(
                topic_id=topic_id,
                title=entry["title"],
                url=entry["url"],
                snippet=entry.get("snippet", ""),
                source=AlertSourceEnum.GOOGLE_ALERTS_EMAIL,
            )
            if stored:
                created += 1
            else:
                skipped += 1

    return {
        "new_items": created,
        "duplicates_skipped": skipped,
    }
|
||||
|
||||
|
||||
# IMAP integration for automatic email fetching.
async def fetch_emails_from_imap(
    host: str,
    username: str,
    password: str,
    folder: str = "INBOX",
    search_criteria: str = 'FROM "googlealerts-noreply@google.com" UNSEEN',
    limit: int = 100,
) -> List[bytes]:
    """
    Fetch emails from an IMAP server.

    Args:
        host: IMAP server hostname.
        username: IMAP username.
        password: IMAP password.
        folder: IMAP folder (default: INBOX).
        search_criteria: IMAP search criteria.
        limit: Maximum number of emails.

    Returns:
        List of raw emails as bytes. Empty on any error (never raises).
    """
    try:
        import aioimaplib
    except ImportError:
        logger.error("aioimaplib not installed. Run: pip install aioimaplib")
        return []

    emails = []

    try:
        # Open the IMAP connection and wait for the server greeting.
        client = aioimaplib.IMAP4_SSL(host)
        await client.wait_hello_from_server()

        # Login
        await client.login(username, password)

        # Select the mailbox folder.
        await client.select(folder)

        # Search for matching messages.
        result, data = await client.search(search_criteria)

        if result != 'OK':
            logger.error(f"IMAP search failed: {result}")
            return []

        # Extract message IDs; keep only the most recent `limit` ones.
        message_ids = data[0].split()[-limit:]

        # Fetch each message body.
        for msg_id in message_ids:
            result, data = await client.fetch(msg_id, '(RFC822)')
            if result == 'OK' and data:
                # data is a list of tuples; the payload sits at index 1.
                for item in data:
                    if isinstance(item, tuple) and len(item) >= 2:
                        emails.append(item[1])

        # Logout
        await client.logout()

    except Exception as e:
        logger.error(f"IMAP fetch error: {e}")

    return emails
|
||||
383
backend/alerts_agent/ingestion/rss_fetcher.py
Normal file
383
backend/alerts_agent/ingestion/rss_fetcher.py
Normal file
@@ -0,0 +1,383 @@
|
||||
"""
|
||||
RSS Fetcher für Google Alerts.
|
||||
|
||||
Liest Google Alerts RSS Feeds und konvertiert sie in AlertItems.
|
||||
|
||||
Google Alerts RSS Feed Format:
|
||||
- Feed URL: https://google.com/alerts/feeds/<user_id>/<alert_id>
|
||||
- Entries enthalten: title, link, published, content
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from html import unescape
|
||||
import re
|
||||
|
||||
import httpx
|
||||
|
||||
try:
|
||||
import feedparser
|
||||
FEEDPARSER_AVAILABLE = True
|
||||
except ImportError:
|
||||
FEEDPARSER_AVAILABLE = False
|
||||
|
||||
from ..models.alert_item import AlertItem, AlertSource, AlertStatus
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class FeedConfig:
    """Configuration for a single RSS feed."""
    url: str
    topic_label: str  # e.g. "Inklusion Bayern"
    enabled: bool = True
    fetch_interval_minutes: int = 60
    last_fetched: Optional[datetime] = None
    last_entry_id: Optional[str] = None  # for duplicate detection
|
||||
|
||||
|
||||
@dataclass
class FetchResult:
    """Result of fetching a single feed."""
    feed_url: str
    success: bool
    items: list = field(default_factory=list)  # List[AlertItem]
    error: Optional[str] = None
    fetched_at: datetime = field(default_factory=datetime.utcnow)
    new_items_count: int = 0
    skipped_count: int = 0  # items that were already known
|
||||
|
||||
|
||||
class RSSFetcher:
    """
    Fetcher for Google Alerts RSS feeds.

    Usage:
        fetcher = RSSFetcher()
        fetcher.add_feed("https://google.com/alerts/feeds/...", "Inklusion")
        results = await fetcher.fetch_all()
    """

    def __init__(self, timeout: int = 30, user_agent: str = "BreakPilot-AlertAgent/0.1"):
        """
        Initialize the RSSFetcher.

        Args:
            timeout: HTTP timeout in seconds.
            user_agent: User-Agent header value.

        Raises:
            ImportError: if the optional feedparser dependency is missing.
        """
        if not FEEDPARSER_AVAILABLE:
            raise ImportError(
                "feedparser ist nicht installiert. "
                "Installiere mit: pip install feedparser"
            )

        self.feeds: list[FeedConfig] = []
        self.timeout = timeout
        self.user_agent = user_agent
        # Lazily created, shared HTTP client (see _get_client).
        self._client: Optional[httpx.AsyncClient] = None

    def add_feed(self, url: str, topic_label: str, **kwargs) -> None:
        """Register a feed; extra kwargs are forwarded to FeedConfig."""
        config = FeedConfig(url=url, topic_label=topic_label, **kwargs)
        self.feeds.append(config)
        logger.info(f"Feed hinzugefügt: {topic_label} ({url[:50]}...)")

    def remove_feed(self, url: str) -> bool:
        """Remove a feed by URL; returns True if anything was removed."""
        before = len(self.feeds)
        self.feeds = [f for f in self.feeds if f.url != url]
        return len(self.feeds) < before

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, (re)creating it when closed."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                timeout=self.timeout,
                headers={"User-Agent": self.user_agent},
                follow_redirects=True,
            )
        return self._client

    async def close(self) -> None:
        """Close the HTTP client. Call when done fetching."""
        if self._client:
            await self._client.aclose()
            self._client = None

    async def fetch_feed(self, config: FeedConfig,
                         known_entry_ids: Optional[set] = None) -> FetchResult:
        """
        Fetch a single feed.

        Args:
            config: Feed configuration.
            known_entry_ids: Optional set of already-known entry IDs.

        Returns:
            FetchResult carrying the new AlertItems; errors are captured
            in the result, never raised.
        """
        result = FetchResult(feed_url=config.url, success=False)
        known_ids = known_entry_ids or set()

        try:
            client = await self._get_client()
            response = await client.get(config.url)
            response.raise_for_status()

            # Parse the feed XML.
            feed = feedparser.parse(response.text)

            if feed.bozo and feed.bozo_exception:
                # Feed had parsing issues but may still be usable.
                logger.warning(f"Feed {config.topic_label}: Parsing-Warnung: {feed.bozo_exception}")

            if not feed.entries:
                logger.info(f"Feed {config.topic_label}: Keine Einträge")
                result.success = True
                return result

            items = []
            for entry in feed.entries:
                # Entry ID for duplicate detection; fall back to link/title.
                entry_id = entry.get("id") or entry.get("link") or entry.get("title")

                if entry_id in known_ids:
                    result.skipped_count += 1
                    continue

                # Convert to an AlertItem.
                alert = self._entry_to_alert(entry, config)
                if alert:
                    items.append(alert)
                    result.new_items_count += 1

            result.items = items
            result.success = True
            config.last_fetched = datetime.utcnow()

            logger.info(
                f"Feed {config.topic_label}: {result.new_items_count} neue, "
                f"{result.skipped_count} übersprungen"
            )

        except httpx.HTTPStatusError as e:
            result.error = f"HTTP {e.response.status_code}: {e.response.reason_phrase}"
            logger.error(f"Feed {config.topic_label}: {result.error}")

        except httpx.RequestError as e:
            result.error = f"Request failed: {str(e)}"
            logger.error(f"Feed {config.topic_label}: {result.error}")

        except Exception as e:
            result.error = f"Unexpected error: {str(e)}"
            logger.exception(f"Feed {config.topic_label}: Unerwarteter Fehler")

        return result

    def _entry_to_alert(self, entry: dict, config: FeedConfig) -> Optional[AlertItem]:
        """
        Convert a feedparser entry into an AlertItem.

        Google Alerts entry format:
        - title: title with HTML entities
        - link: URL to the article
        - published_parsed: date as struct_time
        - content[0].value: HTML content with a snippet

        Returns None when the entry has no link or conversion fails.
        """
        try:
            # Clean up the title.
            title = unescape(entry.get("title", ""))
            title = self._clean_html(title)

            # Extract the URL; an entry without one is useless.
            url = entry.get("link", "")
            if not url:
                return None

            # Extract the snippet from content (or summary as fallback).
            snippet = ""
            if "content" in entry and entry["content"]:
                content_html = entry["content"][0].get("value", "")
                snippet = self._clean_html(content_html)
            elif "summary" in entry:
                snippet = self._clean_html(entry["summary"])

            # Parse the publication date.
            published_at = None
            if "published_parsed" in entry and entry["published_parsed"]:
                try:
                    published_at = datetime(*entry["published_parsed"][:6])
                except (TypeError, ValueError):
                    pass

            # Build the AlertItem.
            alert = AlertItem(
                source=AlertSource.GOOGLE_ALERTS_RSS,
                topic_label=config.topic_label,
                feed_url=config.url,
                title=title,
                url=url,
                snippet=snippet[:2000],  # Limit snippet length
                published_at=published_at,
                status=AlertStatus.NEW,
            )

            return alert

        except Exception as e:
            logger.warning(f"Entry konnte nicht konvertiert werden: {e}")
            return None

    def _clean_html(self, html: str) -> str:
        """Strip HTML tags, decode entities, and normalize whitespace."""
        if not html:
            return ""

        # Decode HTML entities.
        text = unescape(html)

        # Drop HTML tags.
        text = re.sub(r"<[^>]+>", " ", text)

        # Normalize whitespace.
        text = re.sub(r"\s+", " ", text)

        return text.strip()

    async def fetch_all(self, known_entry_ids: Optional[set] = None,
                        parallel: bool = True) -> list[FetchResult]:
        """
        Fetch all configured (enabled) feeds.

        Args:
            known_entry_ids: Set of already-known entry IDs (global).
            parallel: Fetch concurrently when True, sequentially otherwise.

        Returns:
            List of FetchResults, one per active feed.
        """
        active_feeds = [f for f in self.feeds if f.enabled]

        if not active_feeds:
            logger.warning("Keine aktiven Feeds konfiguriert")
            return []

        logger.info(f"Fetche {len(active_feeds)} Feeds...")

        if parallel:
            tasks = [
                self.fetch_feed(config, known_entry_ids)
                for config in active_feeds
            ]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Convert raised exceptions into failed FetchResults so the
            # caller always gets one result per feed.
            processed = []
            for i, result in enumerate(results):
                if isinstance(result, Exception):
                    processed.append(FetchResult(
                        feed_url=active_feeds[i].url,
                        success=False,
                        error=str(result)
                    ))
                else:
                    processed.append(result)
            return processed
        else:
            results = []
            for config in active_feeds:
                result = await self.fetch_feed(config, known_entry_ids)
                results.append(result)
            return results

    def get_all_items(self, results: list[FetchResult]) -> list[AlertItem]:
        """Flatten the AlertItems of all successful FetchResults."""
        items = []
        for result in results:
            if result.success:
                items.extend(result.items)
        return items

    def get_stats(self, results: list[FetchResult]) -> dict:
        """Summarize fetch results into aggregate statistics."""
        total_new = sum(r.new_items_count for r in results)
        total_skipped = sum(r.skipped_count for r in results)
        successful = sum(1 for r in results if r.success)
        failed = sum(1 for r in results if not r.success)

        return {
            "feeds_total": len(results),
            "feeds_successful": successful,
            "feeds_failed": failed,
            "items_new": total_new,
            "items_skipped": total_skipped,
            "errors": [r.error for r in results if r.error],
        }
|
||||
|
||||
|
||||
async def fetch_and_store_feed(
    topic_id: str,
    feed_url: str,
    db,
) -> dict:
    """
    Convenience function to fetch a single feed and store results.

    This is the function used by the API to trigger manual fetches.

    Args:
        topic_id: The topic ID to associate with fetched items
        feed_url: The RSS feed URL to fetch
        db: Database session for storing results

    Returns:
        dict with new_items and duplicates_skipped counts

    Raises:
        ImportError: if feedparser is not installed.
        Exception: if the feed fetch itself failed.
    """
    from ..db.repository import AlertItemRepository, TopicRepository

    if not FEEDPARSER_AVAILABLE:
        raise ImportError("feedparser ist nicht installiert")

    fetcher = RSSFetcher()
    fetcher.add_feed(feed_url, topic_label=topic_id)

    # Get known entry IDs to skip duplicates.
    # NOTE(review): this passes article URLs as known_entry_ids, but
    # fetch_feed compares them against RSS entry IDs (entry.get("id")
    # comes first) — if Google's entry IDs are not plain URLs, dedup may
    # never trigger here. Confirm against real feed data.
    alert_repo = AlertItemRepository(db)
    existing_urls = alert_repo.get_existing_urls(topic_id)

    # Fetch the feed
    results = await fetcher.fetch_all(known_entry_ids=existing_urls)
    await fetcher.close()

    if not results:
        return {"new_items": 0, "duplicates_skipped": 0}

    result = results[0]

    if not result.success:
        raise Exception(result.error or "Feed fetch failed")

    # Store new items
    new_count = 0
    for item in result.items:
        alert_repo.create_from_alert_item(item, topic_id)
        new_count += 1

    # Update topic stats
    topic_repo = TopicRepository(db)
    topic_repo.update_fetch_status(
        topic_id,
        last_fetch_error=None,
        items_fetched=new_count,
    )

    return {
        "new_items": new_count,
        "duplicates_skipped": result.skipped_count,
    }
|
||||
279
backend/alerts_agent/ingestion/scheduler.py
Normal file
279
backend/alerts_agent/ingestion/scheduler.py
Normal file
@@ -0,0 +1,279 @@
|
||||
"""
|
||||
Scheduler für automatisches Feed-Fetching.
|
||||
|
||||
Verwendet APScheduler für periodische Jobs basierend auf Topic-Konfiguration.
|
||||
"""
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
||||
from apscheduler.triggers.interval import IntervalTrigger
|
||||
from apscheduler.jobstores.memory import MemoryJobStore
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from alerts_agent.db.database import SessionLocal
|
||||
from alerts_agent.db.repository import TopicRepository
|
||||
from alerts_agent.ingestion.rss_fetcher import fetch_and_store_feed
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Globaler Scheduler (Singleton)
|
||||
_scheduler: Optional[AsyncIOScheduler] = None
|
||||
|
||||
|
||||
def get_scheduler() -> AsyncIOScheduler:
    """Return the process-wide scheduler, creating it lazily on first use."""
    global _scheduler
    if _scheduler is not None:
        return _scheduler

    _scheduler = AsyncIOScheduler(
        jobstores={"default": MemoryJobStore()},
        job_defaults={
            "coalesce": True,          # merge missed runs into a single one
            "max_instances": 1,        # never run the same job concurrently
            "misfire_grace_time": 60,  # seconds of tolerance for late jobs
        },
    )
    return _scheduler
|
||||
|
||||
|
||||
async def fetch_topic_job(topic_id: str, feed_url: str) -> None:
    """
    Job function that fetches a single topic's feed.

    Invoked by the scheduler; opens its own DB session since jobs run
    outside any request context.
    """
    db = SessionLocal()
    try:
        logger.info(f"Scheduler: Fetching topic {topic_id}")
        result = await fetch_and_store_feed(
            topic_id=topic_id,
            feed_url=feed_url,
            db=db,
        )
        logger.info(
            f"Scheduler: Topic {topic_id} - {result['new_items']} new, "
            f"{result['duplicates_skipped']} skipped"
        )
    except Exception as e:
        logger.error(f"Scheduler: Error fetching topic {topic_id}: {e}")
        # Persist the error on the topic so it is visible via the API.
        # NOTE(review): the session may be in a failed transaction state
        # here — confirm TopicRepository.update rolls back first.
        repo = TopicRepository(db)
        repo.update(topic_id, last_fetch_error=str(e))
    finally:
        db.close()
|
||||
|
||||
|
||||
def schedule_topic(
    topic_id: str,
    feed_url: str,
    interval_minutes: int = 60,
) -> str:
    """
    Schedule a periodic fetch job for a topic.

    Args:
        topic_id: ID of the topic.
        feed_url: URL of the RSS feed.
        interval_minutes: Fetch interval in minutes.

    Returns:
        Job ID for later reference.
    """
    scheduler = get_scheduler()
    job_id = f"fetch_topic_{topic_id}"

    # replace_existing=True already swaps out any job with the same id
    # atomically, so the previous explicit get_job/remove_job dance was
    # redundant and has been dropped.
    scheduler.add_job(
        fetch_topic_job,
        trigger=IntervalTrigger(minutes=interval_minutes),
        id=job_id,
        name=f"Fetch Topic {topic_id}",
        kwargs={"topic_id": topic_id, "feed_url": feed_url},
        replace_existing=True,
    )

    logger.info(f"Scheduled topic {topic_id} every {interval_minutes} minutes")
    return job_id
|
||||
|
||||
|
||||
def unschedule_topic(topic_id: str) -> bool:
    """
    Remove the periodic fetch job for a topic.

    Args:
        topic_id: ID of the topic.

    Returns:
        True if a job was removed, False if none existed.
    """
    job_id = f"fetch_topic_{topic_id}"
    sched = get_scheduler()

    if sched.get_job(job_id) is None:
        return False

    sched.remove_job(job_id)
    logger.info(f"Unscheduled topic {topic_id}")
    return True
|
||||
|
||||
|
||||
def reschedule_topic(
    topic_id: str,
    feed_url: str,
    interval_minutes: int,
) -> str:
    """
    Update the interval for a topic's fetch job.

    Delegates to schedule_topic, which replaces any existing job.

    Args:
        topic_id: ID of the topic.
        feed_url: URL of the RSS feed (in case it changed).
        interval_minutes: New interval.

    Returns:
        Job ID.
    """
    return schedule_topic(topic_id, feed_url, interval_minutes)
|
||||
|
||||
|
||||
def sync_scheduler_with_db() -> dict:
    """
    Synchronize the scheduler with the database.

    Loads all topics and schedules/unschedules jobs accordingly, then
    removes jobs whose topics no longer exist.

    Returns:
        Dict with ``scheduled`` and ``unscheduled`` counts.
    """
    db = SessionLocal()
    scheduler = get_scheduler()

    try:
        repo = TopicRepository(db)
        topics = repo.get_all()

        scheduled = 0
        unscheduled = 0

        # Job IDs that should exist after the sync.
        expected_job_ids = set()

        for topic in topics:
            job_id = f"fetch_topic_{topic.id}"

            if topic.is_active and topic.feed_url:
                # Topic should have a scheduled job.
                expected_job_ids.add(job_id)
                schedule_topic(
                    topic_id=topic.id,
                    feed_url=topic.feed_url,
                    interval_minutes=topic.fetch_interval_minutes,
                )
                scheduled += 1
            else:
                # Topic must not have a scheduled job.
                if scheduler.get_job(job_id):
                    scheduler.remove_job(job_id)
                    unscheduled += 1

        # Remove orphan jobs (topics that were deleted).
        for job in scheduler.get_jobs():
            if job.id.startswith("fetch_topic_") and job.id not in expected_job_ids:
                scheduler.remove_job(job.id)
                unscheduled += 1
                logger.info(f"Removed orphan job: {job.id}")

        return {"scheduled": scheduled, "unscheduled": unscheduled}
    finally:
        db.close()
|
||||
|
||||
|
||||
def start_scheduler() -> None:
    """
    Start the scheduler and sync its jobs with the database.

    Call once during application startup; a no-op if already running.
    """
    sched = get_scheduler()
    if sched.running:
        return

    sched.start()
    logger.info("Alert scheduler started")

    # Bring scheduled jobs in line with the topics stored in the DB.
    result = sync_scheduler_with_db()
    logger.info(
        f"Scheduler synced: {result['scheduled']} topics scheduled, "
        f"{result['unscheduled']} removed"
    )
|
||||
|
||||
|
||||
def stop_scheduler() -> None:
    """
    Stop the scheduler.

    Call during application shutdown; a no-op if not running.
    """
    sched = get_scheduler()
    if not sched.running:
        return

    # wait=False: don't block shutdown on in-flight jobs.
    sched.shutdown(wait=False)
    logger.info("Alert scheduler stopped")
|
||||
|
||||
|
||||
def get_scheduler_status() -> dict:
    """
    Report the scheduler's current state.

    Returns:
        Dict with ``running``, ``jobs_count`` and per-job details.
    """
    sched = get_scheduler()

    jobs = [
        {
            "id": job.id,
            "name": job.name,
            "next_run": job.next_run_time.isoformat() if job.next_run_time else None,
            "trigger": str(job.trigger),
        }
        for job in sched.get_jobs()
    ]

    return {
        "running": sched.running,
        "jobs_count": len(jobs),
        "jobs": jobs,
    }
|
||||
|
||||
|
||||
# Convenience-Funktion für Topic-Aktivierung
|
||||
async def on_topic_activated(topic_id: str, feed_url: str, interval_minutes: int) -> None:
|
||||
"""Hook für Topic-Aktivierung - plant den Fetch-Job."""
|
||||
schedule_topic(topic_id, feed_url, interval_minutes)
|
||||
|
||||
|
||||
async def on_topic_deactivated(topic_id: str) -> None:
    """Hook for topic deactivation — removes the fetch job."""
    unschedule_topic(topic_id)
|
||||
|
||||
|
||||
async def on_topic_updated(
    topic_id: str,
    feed_url: str,
    interval_minutes: int,
    is_active: bool,
) -> None:
    """Hook for topic updates — reschedules or removes the fetch job."""
    if is_active and feed_url:
        reschedule_topic(topic_id, feed_url, interval_minutes)
    else:
        # Inactive or feed-less topics must not keep a job around.
        unschedule_topic(topic_id)
|
||||
|
||||
|
||||
async def on_topic_deleted(topic_id: str) -> None:
    """Hook for topic deletion — removes the fetch job."""
    unschedule_topic(topic_id)
|
||||
Reference in New Issue
Block a user