"""
Repository for alert items (individual alerts/articles).
"""
import hashlib
import urllib.parse
import uuid
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Any

from sqlalchemy.orm import Session as DBSession
from sqlalchemy import or_, func

from .models import (
    AlertItemDB, AlertSourceEnum, AlertStatusEnum, RelevanceDecisionEnum
)


class AlertItemRepository:
    """Repository for alert items (individual alerts/articles).

    Every mutating method commits on the session passed to the constructor.
    """

    # Query parameters stripped during URL normalisation (matched
    # case-insensitively) so that tracking noise does not defeat
    # deduplication.
    _TRACKING_PARAMS = frozenset({
        "utm_source", "utm_medium", "utm_campaign",
        "utm_content", "utm_term", "fbclid", "gclid",
        "ref", "source",
    })

    def __init__(self, db: DBSession):
        self.db = db

    # ==================== CREATE ====================

    def create(
        self,
        topic_id: str,
        title: str,
        url: str,
        snippet: str = "",
        source: str = "google_alerts_rss",
        published_at: Optional[datetime] = None,
        lang: str = "de",
    ) -> AlertItemDB:
        """Create, commit and return a new alert.

        Args:
            topic_id: ID of the topic the alert belongs to.
            title: Alert title.
            url: Original URL; also stored in normalised form and as a hash.
            snippet: Text snippet of the alert.
            source: Value convertible to ``AlertSourceEnum``.
            published_at: Publication timestamp, if known.
            lang: Language code stored on the alert.

        Returns:
            The refreshed ``AlertItemDB`` instance.

        Raises:
            ValueError: If ``source`` is not a valid ``AlertSourceEnum``
                value (raised by the enum constructor).
        """
        alert = AlertItemDB(
            id=str(uuid.uuid4()),
            topic_id=topic_id,
            title=title,
            url=url,
            snippet=snippet,
            source=AlertSourceEnum(source),
            published_at=published_at,
            lang=lang,
            url_hash=self._compute_url_hash(url),
            canonical_url=self._normalize_url(url),
        )
        self.db.add(alert)
        self.db.commit()
        self.db.refresh(alert)
        return alert

    def create_if_not_exists(
        self,
        topic_id: str,
        title: str,
        url: str,
        snippet: str = "",
        source: str = "google_alerts_rss",
        published_at: Optional[datetime] = None,
        lang: str = "de",
    ) -> Optional[AlertItemDB]:
        """Create an alert only if its normalised URL is not stored yet.

        The lookup is by URL hash alone and is not restricted to
        ``topic_id``.

        Args:
            topic_id: ID of the topic the alert belongs to.
            title: Alert title.
            url: Original URL.
            snippet: Text snippet of the alert.
            source: Value convertible to ``AlertSourceEnum``.
            published_at: Publication timestamp, if known.
            lang: Language code forwarded to ``create``.

        Returns:
            The new alert, or ``None`` when an alert with the same URL hash
            already exists.
        """
        url_hash = self._compute_url_hash(url)
        if self.get_by_url_hash(url_hash) is not None:
            return None  # duplicate

        return self.create(
            topic_id=topic_id,
            title=title,
            url=url,
            snippet=snippet,
            source=source,
            published_at=published_at,
            lang=lang,
        )

    # ==================== READ ====================

    def get_by_id(self, alert_id: str) -> Optional[AlertItemDB]:
        """Return the alert with the given ID, or ``None``."""
        return self.db.query(AlertItemDB).filter(
            AlertItemDB.id == alert_id
        ).first()

    def get_by_url_hash(self, url_hash: str) -> Optional[AlertItemDB]:
        """Return the first alert with the given URL hash, or ``None``."""
        return self.db.query(AlertItemDB).filter(
            AlertItemDB.url_hash == url_hash
        ).first()

    def get_inbox(
        self,
        user_id: Optional[str] = None,
        topic_id: Optional[str] = None,
        decision: Optional[str] = None,
        status: Optional[str] = None,
        limit: int = 50,
        offset: int = 0,
    ) -> List[AlertItemDB]:
        """Return inbox items matching the given filters.

        Without ``decision`` the result contains alerts decided as KEEP or
        REVIEW as well as alerts that have no decision yet.

        Args:
            user_id: Currently unused; no filter is applied for it.
            topic_id: Restrict to this topic when given.
            decision: Value convertible to ``RelevanceDecisionEnum``.
            status: Value convertible to ``AlertStatusEnum``.
            limit: Maximum number of rows.
            offset: Number of rows to skip.

        Returns:
            Alerts ordered by relevance score (descending, NULLs last), then
            by fetch time (newest first).
        """
        query = self.db.query(AlertItemDB)

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        if decision:
            query = query.filter(
                AlertItemDB.relevance_decision == RelevanceDecisionEnum(decision)
            )
        else:
            # Default view: KEEP, REVIEW and not-yet-scored items.
            query = query.filter(
                or_(
                    AlertItemDB.relevance_decision == RelevanceDecisionEnum.KEEP,
                    AlertItemDB.relevance_decision == RelevanceDecisionEnum.REVIEW,
                    AlertItemDB.relevance_decision.is_(None)
                )
            )

        if status:
            query = query.filter(AlertItemDB.status == AlertStatusEnum(status))

        return query.order_by(
            AlertItemDB.relevance_score.desc().nullslast(),
            AlertItemDB.fetched_at.desc()
        ).offset(offset).limit(limit).all()

    def get_unscored(
        self,
        topic_id: Optional[str] = None,
        limit: int = 100,
    ) -> List[AlertItemDB]:
        """Return alerts whose status is NEW, newest fetch first.

        Args:
            topic_id: Restrict to this topic when given.
            limit: Maximum number of rows.
        """
        query = self.db.query(AlertItemDB).filter(
            AlertItemDB.status == AlertStatusEnum.NEW
        )

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        return query.order_by(AlertItemDB.fetched_at.desc()).limit(limit).all()

    def get_by_topic(
        self,
        topic_id: str,
        limit: int = 100,
        offset: int = 0,
    ) -> List[AlertItemDB]:
        """Return the alerts of a topic, newest fetch first."""
        return self.db.query(AlertItemDB).filter(
            AlertItemDB.topic_id == topic_id
        ).order_by(
            AlertItemDB.fetched_at.desc()
        ).offset(offset).limit(limit).all()

    def count_by_status(self, topic_id: Optional[str] = None) -> Dict[str, int]:
        """Count alerts per status.

        Returns:
            Mapping of status enum value to row count.
        """
        query = self.db.query(
            AlertItemDB.status,
            func.count(AlertItemDB.id).label('count')
        )

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        results = query.group_by(AlertItemDB.status).all()

        return {r[0].value: r[1] for r in results}

    def count_by_decision(self, topic_id: Optional[str] = None) -> Dict[str, int]:
        """Count alerts per relevance decision.

        Returns:
            Mapping of decision enum value to row count; alerts without a
            decision are counted under the key ``"unscored"``.
        """
        query = self.db.query(
            AlertItemDB.relevance_decision,
            func.count(AlertItemDB.id).label('count')
        )

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        results = query.group_by(AlertItemDB.relevance_decision).all()

        return {
            (r[0].value if r[0] else "unscored"): r[1]
            for r in results
        }

    # ==================== UPDATE ====================

    def update_scoring(
        self,
        alert_id: str,
        score: float,
        decision: str,
        reasons: Optional[List[str]] = None,
        summary: Optional[str] = None,
        model: Optional[str] = None,
    ) -> Optional[AlertItemDB]:
        """Store a scoring result on an alert and mark it as SCORED.

        Args:
            alert_id: ID of the alert to update.
            score: Relevance score.
            decision: Value convertible to ``RelevanceDecisionEnum``.
            reasons: Reasons for the decision; stored as ``[]`` when omitted.
            summary: Summary text.
            model: Identifier of the model that produced the score.

        Returns:
            The refreshed alert, or ``None`` if ``alert_id`` is unknown.
        """
        alert = self.get_by_id(alert_id)
        if not alert:
            return None

        # One timestamp so scored_at and processed_at are exactly equal.
        now = datetime.utcnow()

        alert.relevance_score = score
        alert.relevance_decision = RelevanceDecisionEnum(decision)
        alert.relevance_reasons = reasons or []
        alert.relevance_summary = summary
        alert.scored_by_model = model
        alert.scored_at = now
        alert.status = AlertStatusEnum.SCORED
        alert.processed_at = now

        self.db.commit()
        self.db.refresh(alert)
        return alert

    def update_status(
        self,
        alert_id: str,
        status: str,
    ) -> Optional[AlertItemDB]:
        """Set the status of an alert.

        Returns:
            The refreshed alert, or ``None`` if ``alert_id`` is unknown.
        """
        alert = self.get_by_id(alert_id)
        if not alert:
            return None

        alert.status = AlertStatusEnum(status)

        self.db.commit()
        self.db.refresh(alert)
        return alert

    def mark_reviewed(
        self,
        alert_id: str,
        is_relevant: bool,
        notes: Optional[str] = None,
        tags: Optional[List[str]] = None,
    ) -> Optional[AlertItemDB]:
        """Mark an alert as REVIEWED and record the user's feedback.

        Falsy ``notes`` / ``tags`` (``None``, ``""``, ``[]``) leave the
        stored values untouched.

        Returns:
            The refreshed alert, or ``None`` if ``alert_id`` is unknown.
        """
        alert = self.get_by_id(alert_id)
        if not alert:
            return None

        alert.status = AlertStatusEnum.REVIEWED
        alert.user_marked_relevant = is_relevant
        if notes:
            alert.user_notes = notes
        if tags:
            alert.user_tags = tags

        self.db.commit()
        self.db.refresh(alert)
        return alert

    def archive(self, alert_id: str) -> Optional[AlertItemDB]:
        """Set the status of an alert to ``"archived"``."""
        return self.update_status(alert_id, "archived")

    # ==================== DELETE ====================

    def delete(self, alert_id: str) -> bool:
        """Delete an alert.

        Returns:
            ``True`` if the alert existed and was deleted, else ``False``.
        """
        alert = self.get_by_id(alert_id)
        if not alert:
            return False

        self.db.delete(alert)
        self.db.commit()
        return True

    def delete_old(self, days: int = 90, topic_id: Optional[str] = None) -> int:
        """Delete archived alerts fetched more than ``days`` days ago.

        Args:
            days: Age threshold in days, relative to ``datetime.utcnow()``.
            topic_id: Restrict deletion to this topic when given.

        Returns:
            Number of deleted rows.
        """
        cutoff = datetime.utcnow() - timedelta(days=days)

        query = self.db.query(AlertItemDB).filter(
            AlertItemDB.status == AlertStatusEnum.ARCHIVED,
            AlertItemDB.fetched_at < cutoff,
        )

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        count = query.delete()
        self.db.commit()
        return count

    # ==================== FOR RSS FETCHER ====================

    def get_existing_urls(self, topic_id: str) -> set:
        """Return all known, non-empty URL hashes of a topic.

        Used by the RSS fetcher to avoid duplicates.
        """
        results = self.db.query(AlertItemDB.url_hash).filter(
            AlertItemDB.topic_id == topic_id
        ).all()

        return {r[0] for r in results if r[0]}

    def create_from_alert_item(self, alert_item, topic_id: str) -> AlertItemDB:
        """Create an alert from an AlertItem object of the RSS fetcher.

        Args:
            alert_item: Object exposing ``title``, ``url``, ``snippet``,
                ``source`` and ``published_at``.
            topic_id: Topic ID to associate the alert with.

        Returns:
            The created ``AlertItemDB`` instance.
        """
        return self.create(
            topic_id=topic_id,
            title=alert_item.title,
            url=alert_item.url,
            snippet=alert_item.snippet or "",
            # Accept both enum members and plain values for the source.
            source=alert_item.source.value if hasattr(alert_item.source, 'value') else str(alert_item.source),
            published_at=alert_item.published_at,
        )

    # ==================== HELPER ====================

    def _compute_url_hash(self, url: str) -> str:
        """Return the first 16 hex digits of the SHA-256 of the normalised URL."""
        normalized = self._normalize_url(url)
        return hashlib.sha256(normalized.encode()).hexdigest()[:16]

    def _normalize_url(self, url: str) -> str:
        """Normalise a URL for deduplication.

        Lower-cases the network location, strips trailing slashes from the
        path, removes tracking query parameters and drops the fragment.
        Query parameters with blank values are dropped as well because
        ``parse_qs`` is called with its defaults.
        """
        parsed = urllib.parse.urlparse(url)

        # Remove tracking parameters.
        query_params = urllib.parse.parse_qs(parsed.query)
        cleaned_params = {
            k: v for k, v in query_params.items()
            if k.lower() not in self._TRACKING_PARAMS
        }
        cleaned_query = urllib.parse.urlencode(cleaned_params, doseq=True)

        # Rebuild the URL without the fragment.
        normalized = urllib.parse.urlunparse((
            parsed.scheme,
            parsed.netloc.lower(),
            parsed.path.rstrip("/"),
            parsed.params,
            cleaned_query,
            ""  # no fragment
        ))

        return normalized

    # ==================== CONVERSION ====================

    def to_dict(self, alert: AlertItemDB) -> Dict[str, Any]:
        """Convert a DB model into a plain dictionary.

        Timestamps are rendered with ``isoformat()`` (or ``None``), enums as
        their ``value``.
        """
        return {
            "id": alert.id,
            "topic_id": alert.topic_id,
            "title": alert.title,
            "url": alert.url,
            "snippet": alert.snippet,
            "source": alert.source.value,
            "lang": alert.lang,
            "published_at": alert.published_at.isoformat() if alert.published_at else None,
            "fetched_at": alert.fetched_at.isoformat() if alert.fetched_at else None,
            "status": alert.status.value,
            "relevance": {
                "score": alert.relevance_score,
                "decision": alert.relevance_decision.value if alert.relevance_decision else None,
                "reasons": alert.relevance_reasons,
                "summary": alert.relevance_summary,
                "model": alert.scored_by_model,
                "scored_at": alert.scored_at.isoformat() if alert.scored_at else None,
            },
            "user_feedback": {
                "marked_relevant": alert.user_marked_relevant,
                "tags": alert.user_tags,
                "notes": alert.user_notes,
            },
        }