""" AlertItem Model. Repräsentiert einen einzelnen Alert aus Google Alerts (RSS oder Email). """ from dataclasses import dataclass, field from datetime import datetime from enum import Enum from typing import Optional import hashlib import uuid class AlertSource(str, Enum): """Quelle des Alerts.""" GOOGLE_ALERTS_RSS = "google_alerts_rss" GOOGLE_ALERTS_EMAIL = "google_alerts_email" MANUAL = "manual" class AlertStatus(str, Enum): """Verarbeitungsstatus des Alerts.""" NEW = "new" PROCESSED = "processed" DUPLICATE = "duplicate" SCORED = "scored" REVIEWED = "reviewed" ARCHIVED = "archived" @dataclass class AlertItem: """Ein einzelner Alert-Eintrag.""" # Identifikation id: str = field(default_factory=lambda: str(uuid.uuid4())) # Quelle source: AlertSource = AlertSource.GOOGLE_ALERTS_RSS topic_label: str = "" # z.B. "Schulrecht Bayern" feed_url: Optional[str] = None # Content title: str = "" url: str = "" snippet: str = "" article_text: Optional[str] = None # Metadaten lang: str = "de" published_at: Optional[datetime] = None fetched_at: datetime = field(default_factory=datetime.utcnow) # Deduplication canonical_url: Optional[str] = None url_hash: Optional[str] = None content_hash: Optional[str] = None # SimHash für fuzzy matching # Verarbeitung status: AlertStatus = AlertStatus.NEW cluster_id: Optional[str] = None # Relevanz (nach Scoring) relevance_score: Optional[float] = None # 0.0 - 1.0 relevance_decision: Optional[str] = None # KEEP, DROP, REVIEW relevance_reasons: list = field(default_factory=list) relevance_summary: Optional[str] = None def __post_init__(self): """Berechne Hashes nach Initialisierung.""" if not self.url_hash and self.url: self.url_hash = self._compute_url_hash() if not self.canonical_url and self.url: self.canonical_url = self._normalize_url(self.url) def _compute_url_hash(self) -> str: """Berechne SHA256 Hash der URL.""" normalized = self._normalize_url(self.url) return hashlib.sha256(normalized.encode()).hexdigest()[:16] def _normalize_url(self, url: str) -> str: """Normalisiere URL für Deduplizierung.""" # Entferne Tracking-Parameter import urllib.parse parsed = urllib.parse.urlparse(url) # Google News Redirect auflösen if "news.google.com" in parsed.netloc and "/articles/" in parsed.path: # news.google.com URLs enthalten die echte URL base64-kodiert # Hier nur Basic-Handling - echte Auflösung komplexer pass # Tracking-Parameter entfernen tracking_params = { "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term", "fbclid", "gclid", "ref", "source" } query_params = urllib.parse.parse_qs(parsed.query) cleaned_params = {k: v for k, v in query_params.items() if k.lower() not in tracking_params} cleaned_query = urllib.parse.urlencode(cleaned_params, doseq=True) # Rekonstruiere URL ohne Fragment normalized = urllib.parse.urlunparse(( parsed.scheme, parsed.netloc.lower(), parsed.path.rstrip("/"), parsed.params, cleaned_query, "" # No fragment )) return normalized def compute_content_hash(self, text: Optional[str] = None) -> str: """ Berechne SimHash des Inhalts für Fuzzy-Matching. SimHash erlaubt es, ähnliche Texte zu erkennen, auch wenn sie sich leicht unterscheiden (z.B. verschiedene Quellen zum selben Thema). """ from ..processing.dedup import compute_simhash content = text or self.article_text or self.snippet or self.title if content: self.content_hash = compute_simhash(content) return self.content_hash or "" def to_dict(self) -> dict: """Konvertiere zu Dictionary für JSON/DB.""" return { "id": self.id, "source": self.source.value, "topic_label": self.topic_label, "feed_url": self.feed_url, "title": self.title, "url": self.url, "snippet": self.snippet, "article_text": self.article_text, "lang": self.lang, "published_at": self.published_at.isoformat() if self.published_at else None, "fetched_at": self.fetched_at.isoformat() if self.fetched_at else None, "canonical_url": self.canonical_url, "url_hash": self.url_hash, "content_hash": self.content_hash, "status": self.status.value, "cluster_id": self.cluster_id, "relevance_score": self.relevance_score, "relevance_decision": self.relevance_decision, "relevance_reasons": self.relevance_reasons, "relevance_summary": self.relevance_summary, } @classmethod def from_dict(cls, data: dict) -> "AlertItem": """Erstelle AlertItem aus Dictionary.""" # Parse Enums if "source" in data and isinstance(data["source"], str): data["source"] = AlertSource(data["source"]) if "status" in data and isinstance(data["status"], str): data["status"] = AlertStatus(data["status"]) # Parse Timestamps for field_name in ["published_at", "fetched_at"]: if field_name in data and isinstance(data[field_name], str): data[field_name] = datetime.fromisoformat(data[field_name]) return cls(**data) def __repr__(self) -> str: return f"AlertItem(id={self.id[:8]}, title='{self.title[:50]}...', status={self.status.value})"