A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
175 lines
5.8 KiB
Python
"""
|
|
AlertItem Model.
|
|
|
|
Repräsentiert einen einzelnen Alert aus Google Alerts (RSS oder Email).
|
|
"""
|
|
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
from typing import Optional
|
|
import hashlib
|
|
import uuid
|
|
|
|
|
|
class AlertSource(str, Enum):
    """Origin of an alert item."""

    # Pulled from a Google Alerts RSS feed.
    GOOGLE_ALERTS_RSS = "google_alerts_rss"
    # Extracted from a Google Alerts notification email.
    GOOGLE_ALERTS_EMAIL = "google_alerts_email"
    # Entered by hand by a user.
    MANUAL = "manual"
|
|
|
|
|
|
class AlertStatus(str, Enum):
    """Processing state of an alert item."""

    # Just ingested, not yet touched by the pipeline.
    NEW = "new"
    # Passed through initial processing.
    PROCESSED = "processed"
    # Identified as a duplicate of another item.
    DUPLICATE = "duplicate"
    # A relevance score has been assigned.
    SCORED = "scored"
    # A human has reviewed the item.
    REVIEWED = "reviewed"
    # Retired from the active workflow.
    ARCHIVED = "archived"
|
|
|
|
|
|
@dataclass
class AlertItem:
    """A single alert entry from Google Alerts (RSS or email).

    URL-derived deduplication fields (``url_hash``, ``canonical_url``) are
    filled in automatically by ``__post_init__``; ``content_hash`` must be
    computed explicitly via :meth:`compute_content_hash`.
    """

    # Identification
    id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Source
    source: AlertSource = AlertSource.GOOGLE_ALERTS_RSS
    topic_label: str = ""  # e.g. "Schulrecht Bayern"
    feed_url: Optional[str] = None

    # Content
    title: str = ""
    url: str = ""
    snippet: str = ""
    article_text: Optional[str] = None

    # Metadata
    lang: str = "de"
    published_at: Optional[datetime] = None
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12 but kept
    # here because switching to an aware datetime would change isoformat output
    # and the to_dict/from_dict round-trip — confirm before migrating.
    fetched_at: datetime = field(default_factory=datetime.utcnow)

    # Deduplication
    canonical_url: Optional[str] = None
    url_hash: Optional[str] = None
    content_hash: Optional[str] = None  # SimHash for fuzzy matching

    # Processing
    status: AlertStatus = AlertStatus.NEW
    cluster_id: Optional[str] = None

    # Relevance (set after scoring)
    relevance_score: Optional[float] = None  # 0.0 - 1.0
    relevance_decision: Optional[str] = None  # KEEP, DROP, REVIEW
    relevance_reasons: list = field(default_factory=list)
    relevance_summary: Optional[str] = None

    def __post_init__(self) -> None:
        """Compute URL-derived hashes after initialization.

        Only fills fields that are still empty, so values supplied by the
        caller (e.g. when rehydrating via ``from_dict``) are preserved.
        """
        if not self.url_hash and self.url:
            self.url_hash = self._compute_url_hash()
        if not self.canonical_url and self.url:
            self.canonical_url = self._normalize_url(self.url)

    def _compute_url_hash(self) -> str:
        """Return the first 16 hex chars of the SHA256 of the normalized URL."""
        normalized = self._normalize_url(self.url)
        return hashlib.sha256(normalized.encode()).hexdigest()[:16]

    def _normalize_url(self, url: str) -> str:
        """Normalize a URL for deduplication.

        Lowercases the host, strips a trailing slash from the path, removes
        common tracking query parameters, and drops the fragment.
        """
        import urllib.parse
        parsed = urllib.parse.urlparse(url)

        # Google News redirect URLs embed the real target base64-encoded in
        # the path; resolving it properly is more involved, so only this
        # placeholder handling exists for now.
        if "news.google.com" in parsed.netloc and "/articles/" in parsed.path:
            pass

        # Query parameters that only track the click, not identify the page.
        tracking_params = {
            "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
            "fbclid", "gclid", "ref", "source"
        }

        query_params = urllib.parse.parse_qs(parsed.query)
        cleaned_params = {k: v for k, v in query_params.items()
                          if k.lower() not in tracking_params}

        cleaned_query = urllib.parse.urlencode(cleaned_params, doseq=True)

        # Reconstruct the URL without the fragment.
        normalized = urllib.parse.urlunparse((
            parsed.scheme,
            parsed.netloc.lower(),
            parsed.path.rstrip("/"),
            parsed.params,
            cleaned_query,
            ""  # No fragment
        ))

        return normalized

    def compute_content_hash(self, text: Optional[str] = None) -> str:
        """Compute and store a SimHash of the content for fuzzy matching.

        SimHash detects near-duplicate texts even when they differ slightly
        (e.g. different sources covering the same story). Falls back from
        *text* to ``article_text``, ``snippet``, then ``title``; returns ""
        if no content is available.
        """
        from ..processing.dedup import compute_simhash

        content = text or self.article_text or self.snippet or self.title
        if content:
            self.content_hash = compute_simhash(content)
        return self.content_hash or ""

    def to_dict(self) -> dict:
        """Convert to a plain dictionary for JSON/DB serialization."""
        return {
            "id": self.id,
            "source": self.source.value,
            "topic_label": self.topic_label,
            "feed_url": self.feed_url,
            "title": self.title,
            "url": self.url,
            "snippet": self.snippet,
            "article_text": self.article_text,
            "lang": self.lang,
            "published_at": self.published_at.isoformat() if self.published_at else None,
            "fetched_at": self.fetched_at.isoformat() if self.fetched_at else None,
            "canonical_url": self.canonical_url,
            "url_hash": self.url_hash,
            "content_hash": self.content_hash,
            "status": self.status.value,
            "cluster_id": self.cluster_id,
            "relevance_score": self.relevance_score,
            "relevance_decision": self.relevance_decision,
            "relevance_reasons": self.relevance_reasons,
            "relevance_summary": self.relevance_summary,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "AlertItem":
        """Create an AlertItem from a dictionary (inverse of ``to_dict``).

        String enum values and ISO-format timestamps are parsed back into
        their rich types. The input mapping is not mutated.
        """
        # Copy so enum/datetime parsing does not mutate the caller's dict
        # (the previous implementation wrote parsed values back in place).
        data = dict(data)

        # Parse Enums
        if "source" in data and isinstance(data["source"], str):
            data["source"] = AlertSource(data["source"])
        if "status" in data and isinstance(data["status"], str):
            data["status"] = AlertStatus(data["status"])

        # Parse Timestamps
        for field_name in ["published_at", "fetched_at"]:
            if field_name in data and isinstance(data[field_name], str):
                data[field_name] = datetime.fromisoformat(data[field_name])

        return cls(**data)

    def __repr__(self) -> str:
        return f"AlertItem(id={self.id[:8]}, title='{self.title[:50]}...', status={self.status.value})"
|