Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
174
backend-lehrer/alerts_agent/models/alert_item.py
Normal file
174
backend-lehrer/alerts_agent/models/alert_item.py
Normal file
@@ -0,0 +1,174 @@
"""
AlertItem model.

Represents a single alert item from Google Alerts (RSS or email).
"""
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
import hashlib
|
||||
import uuid
|
||||
|
||||
|
||||
class AlertSource(str, Enum):
    """Origin of an alert item (where it was ingested from)."""

    GOOGLE_ALERTS_RSS = "google_alerts_rss"      # Google Alerts RSS feed
    GOOGLE_ALERTS_EMAIL = "google_alerts_email"  # Google Alerts e-mail digest
    MANUAL = "manual"                            # manually entered item
class AlertStatus(str, Enum):
    """Processing state of an alert as it moves through the pipeline."""

    NEW = "new"
    PROCESSED = "processed"
    DUPLICATE = "duplicate"
    SCORED = "scored"
    REVIEWED = "reviewed"
    ARCHIVED = "archived"
@dataclass
class AlertItem:
    """A single alert entry (from Google Alerts RSS/email or manual input).

    On construction, ``__post_init__`` derives ``url_hash`` and
    ``canonical_url`` from ``url`` when they were not supplied explicitly.
    ``to_dict``/``from_dict`` round-trip the item through a plain dict for
    JSON/DB storage.
    """

    # --- Identification ---
    id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # --- Source ---
    source: AlertSource = AlertSource.GOOGLE_ALERTS_RSS
    topic_label: str = ""  # e.g. "Schulrecht Bayern"
    feed_url: Optional[str] = None

    # --- Content ---
    title: str = ""
    url: str = ""
    snippet: str = ""
    article_text: Optional[str] = None

    # --- Metadata ---
    lang: str = "de"
    published_at: Optional[datetime] = None
    # NOTE(review): datetime.utcnow() returns a *naive* timestamp and is
    # deprecated since Python 3.12. Kept as-is for compatibility with
    # already-stored isoformat values; consider datetime.now(timezone.utc).
    fetched_at: datetime = field(default_factory=datetime.utcnow)

    # --- Deduplication ---
    canonical_url: Optional[str] = None
    url_hash: Optional[str] = None
    content_hash: Optional[str] = None  # SimHash for fuzzy matching

    # --- Processing ---
    status: AlertStatus = AlertStatus.NEW
    cluster_id: Optional[str] = None

    # --- Relevance (filled in after scoring) ---
    relevance_score: Optional[float] = None  # 0.0 - 1.0
    relevance_decision: Optional[str] = None  # KEEP, DROP, REVIEW
    relevance_reasons: list = field(default_factory=list)
    relevance_summary: Optional[str] = None

    def __post_init__(self) -> None:
        """Derive hash and canonical-URL fields from ``url`` after init."""
        if not self.url_hash and self.url:
            self.url_hash = self._compute_url_hash()
        if not self.canonical_url and self.url:
            self.canonical_url = self._normalize_url(self.url)

    def _compute_url_hash(self) -> str:
        """Return the first 16 hex chars of SHA-256 of the normalized URL."""
        normalized = self._normalize_url(self.url)
        return hashlib.sha256(normalized.encode()).hexdigest()[:16]

    def _normalize_url(self, url: str) -> str:
        """Normalize *url* for deduplication.

        Lowercases the host, drops the fragment, strips trailing path
        slashes and removes common tracking query parameters
        (utm_*, fbclid, gclid, ref, source).
        """
        import urllib.parse

        parsed = urllib.parse.urlparse(url)

        # Google News redirect URLs embed the real article URL
        # base64-encoded; proper resolution is more involved, so those
        # URLs are deliberately left untouched here.
        if "news.google.com" in parsed.netloc and "/articles/" in parsed.path:
            pass

        # Query parameters that carry only tracking information.
        tracking_params = {
            "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
            "fbclid", "gclid", "ref", "source",
        }

        query_params = urllib.parse.parse_qs(parsed.query)
        cleaned_params = {k: v for k, v in query_params.items()
                          if k.lower() not in tracking_params}

        cleaned_query = urllib.parse.urlencode(cleaned_params, doseq=True)

        # Rebuild the URL without the fragment.
        return urllib.parse.urlunparse((
            parsed.scheme,
            parsed.netloc.lower(),
            parsed.path.rstrip("/"),
            parsed.params,
            cleaned_query,
            "",  # no fragment
        ))

    def compute_content_hash(self, text: Optional[str] = None) -> str:
        """Compute and store a SimHash of the content for fuzzy matching.

        SimHash lets near-duplicate texts be detected even when they differ
        slightly (e.g. different outlets covering the same story). Content
        falls back from *text* to ``article_text``, ``snippet``, ``title``.
        Returns the hash, or ``""`` when no content is available.
        """
        from ..processing.dedup import compute_simhash

        content = text or self.article_text or self.snippet or self.title
        if content:
            self.content_hash = compute_simhash(content)
        return self.content_hash or ""

    def to_dict(self) -> dict:
        """Serialize to a plain dict suitable for JSON/DB storage."""
        return {
            "id": self.id,
            "source": self.source.value,
            "topic_label": self.topic_label,
            "feed_url": self.feed_url,
            "title": self.title,
            "url": self.url,
            "snippet": self.snippet,
            "article_text": self.article_text,
            "lang": self.lang,
            "published_at": self.published_at.isoformat() if self.published_at else None,
            "fetched_at": self.fetched_at.isoformat() if self.fetched_at else None,
            "canonical_url": self.canonical_url,
            "url_hash": self.url_hash,
            "content_hash": self.content_hash,
            "status": self.status.value,
            "cluster_id": self.cluster_id,
            "relevance_score": self.relevance_score,
            "relevance_decision": self.relevance_decision,
            "relevance_reasons": self.relevance_reasons,
            "relevance_summary": self.relevance_summary,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "AlertItem":
        """Build an AlertItem from a dict as produced by ``to_dict``.

        The input mapping is NOT mutated, and keys that are not dataclass
        fields are ignored (so records written by a newer schema still load).
        """
        # Shallow-copy first: the previous implementation mutated the
        # caller's dict in place while coercing enums and timestamps.
        data = dict(data)

        # Coerce enum fields from their string values.
        if "source" in data and isinstance(data["source"], str):
            data["source"] = AlertSource(data["source"])
        if "status" in data and isinstance(data["status"], str):
            data["status"] = AlertStatus(data["status"])

        # Parse ISO-format timestamps back into datetime objects.
        for field_name in ("published_at", "fetched_at"):
            if field_name in data and isinstance(data[field_name], str):
                data[field_name] = datetime.fromisoformat(data[field_name])

        # Drop unknown keys for forward compatibility.
        known = cls.__dataclass_fields__
        return cls(**{k: v for k, v in data.items() if k in known})

    def __repr__(self) -> str:
        return f"AlertItem(id={self.id[:8]}, title='{self.title[:50]}...', status={self.status.value})"
||||
Reference in New Issue
Block a user