This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

175 lines
5.8 KiB
Python

"""
AlertItem Model.
Repräsentiert einen einzelnen Alert aus Google Alerts (RSS oder Email).
"""
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Optional
import hashlib
import uuid
class AlertSource(str, Enum):
    """Origin of an alert.

    Members are also ``str`` instances, so each one compares equal to its
    raw string value.
    """

    # Alert taken from a Google Alerts RSS feed.
    GOOGLE_ALERTS_RSS = "google_alerts_rss"
    # Alert taken from a Google Alerts e-mail.
    GOOGLE_ALERTS_EMAIL = "google_alerts_email"
    # Alert entered manually.
    MANUAL = "manual"
class AlertStatus(str, Enum):
    """Processing status of an alert.

    Members are also ``str`` instances, so each one compares equal to its
    raw string value.
    """

    NEW = "new"
    PROCESSED = "processed"
    DUPLICATE = "duplicate"
    SCORED = "scored"
    REVIEWED = "reviewed"
    ARCHIVED = "archived"
@dataclass
class AlertItem:
    """A single alert entry.

    On construction, ``url_hash`` and ``canonical_url`` are derived from
    ``url`` unless they were supplied explicitly (see ``__post_init__``).
    """

    # Query parameters that are stripped during URL normalisation.
    # Deliberately un-annotated so the dataclass does not treat it as a field.
    _TRACKING_PARAMS = frozenset({
        "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
        "fbclid", "gclid", "ref", "source",
    })

    # Identification
    id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Source
    source: AlertSource = AlertSource.GOOGLE_ALERTS_RSS
    topic_label: str = ""  # e.g. "Schulrecht Bayern"
    feed_url: Optional[str] = None

    # Content
    title: str = ""
    url: str = ""
    snippet: str = ""
    article_text: Optional[str] = None

    # Metadata
    lang: str = "de"
    published_at: Optional[datetime] = None
    # NOTE(review): datetime.utcnow() returns a naive datetime and is
    # deprecated since Python 3.12. Kept as-is because switching to an aware
    # timestamp would change the serialised format of ``fetched_at``.
    fetched_at: datetime = field(default_factory=datetime.utcnow)

    # Deduplication
    canonical_url: Optional[str] = None
    url_hash: Optional[str] = None
    content_hash: Optional[str] = None  # SimHash for fuzzy matching

    # Processing
    status: AlertStatus = AlertStatus.NEW
    cluster_id: Optional[str] = None

    # Relevance (filled in after scoring)
    relevance_score: Optional[float] = None  # 0.0 - 1.0
    relevance_decision: Optional[str] = None  # KEEP, DROP, REVIEW
    relevance_reasons: list = field(default_factory=list)
    relevance_summary: Optional[str] = None

    def __post_init__(self) -> None:
        """Fill in ``url_hash`` and ``canonical_url`` when they are missing.

        Nothing is derived when ``url`` is empty; values that were passed in
        explicitly are left untouched.
        """
        if not self.url:
            return
        if not self.url_hash:
            self.url_hash = self._compute_url_hash()
        if not self.canonical_url:
            self.canonical_url = self._normalize_url(self.url)

    def _compute_url_hash(self) -> str:
        """Return the first 16 hex characters of the SHA-256 of the normalised URL."""
        normalized = self._normalize_url(self.url)
        return hashlib.sha256(normalized.encode()).hexdigest()[:16]

    def _normalize_url(self, url: str) -> str:
        """Normalise ``url`` for deduplication.

        The result has a lower-cased host, no trailing slash on the path, no
        fragment, and no tracking query parameters (matched
        case-insensitively against ``_TRACKING_PARAMS``). Remaining query
        parameters keep their original order; parameters with blank values
        are dropped because ``parse_qs`` is used with its defaults.

        Args:
            url: The URL to normalise.

        Returns:
            The normalised URL string.
        """
        import urllib.parse

        parsed = urllib.parse.urlparse(url)

        # NOTE: Google News redirect URLs (news.google.com/articles/...) embed
        # the real target URL in encoded form. They are not resolved here and
        # are normalised like any other URL.

        query_params = urllib.parse.parse_qs(parsed.query)
        kept_params = {
            key: values
            for key, values in query_params.items()
            if key.lower() not in self._TRACKING_PARAMS
        }
        cleaned_query = urllib.parse.urlencode(kept_params, doseq=True)

        # Rebuild the URL without its fragment.
        return urllib.parse.urlunparse((
            parsed.scheme,
            parsed.netloc.lower(),
            parsed.path.rstrip("/"),
            parsed.params,
            cleaned_query,
            "",  # no fragment
        ))

    def compute_content_hash(self, text: Optional[str] = None) -> str:
        """Compute and store the SimHash of the content for fuzzy matching.

        The first non-empty value of ``text``, ``article_text``, ``snippet``
        and ``title`` is hashed and stored in ``content_hash``. When all of
        them are empty, ``content_hash`` is left unchanged.

        Args:
            text: Optional text to hash instead of the item's own content.

        Returns:
            The current ``content_hash``, or ``""`` if none is set.
        """
        # Imported at call time rather than module level.
        # NOTE(review): presumably to avoid a circular import - confirm.
        from ..processing.dedup import compute_simhash

        content = text or self.article_text or self.snippet or self.title
        if content:
            self.content_hash = compute_simhash(content)
        return self.content_hash or ""

    def to_dict(self) -> dict:
        """Convert the item to a plain dictionary for JSON/DB use.

        Enums are emitted as their string values and datetimes as ISO 8601
        strings (``None`` stays ``None``). ``relevance_reasons`` is the same
        list object held by the item, not a copy.
        """
        return {
            "id": self.id,
            "source": self.source.value,
            "topic_label": self.topic_label,
            "feed_url": self.feed_url,
            "title": self.title,
            "url": self.url,
            "snippet": self.snippet,
            "article_text": self.article_text,
            "lang": self.lang,
            "published_at": self.published_at.isoformat() if self.published_at else None,
            "fetched_at": self.fetched_at.isoformat() if self.fetched_at else None,
            "canonical_url": self.canonical_url,
            "url_hash": self.url_hash,
            "content_hash": self.content_hash,
            "status": self.status.value,
            "cluster_id": self.cluster_id,
            "relevance_score": self.relevance_score,
            "relevance_decision": self.relevance_decision,
            "relevance_reasons": self.relevance_reasons,
            "relevance_summary": self.relevance_summary,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "AlertItem":
        """Create an ``AlertItem`` from a dictionary such as ``to_dict`` produces.

        String values under ``source``/``status`` are converted to their
        enums, and string values under ``published_at``/``fetched_at`` are
        parsed with ``datetime.fromisoformat``. ``data`` itself is not
        modified.

        Args:
            data: Mapping of field names to values.

        Returns:
            The constructed item.

        Raises:
            ValueError: If an enum value or timestamp string is invalid.
            TypeError: If ``data`` contains keys that are not fields.
        """
        # Work on a shallow copy so the caller's dict keeps its original
        # string values instead of being rewritten with enum/datetime objects.
        values = dict(data)

        # Parse enums
        if isinstance(values.get("source"), str):
            values["source"] = AlertSource(values["source"])
        if isinstance(values.get("status"), str):
            values["status"] = AlertStatus(values["status"])

        # Parse timestamps
        for field_name in ("published_at", "fetched_at"):
            raw = values.get(field_name)
            if isinstance(raw, str):
                values[field_name] = datetime.fromisoformat(raw)

        return cls(**values)

    def __repr__(self) -> str:
        """Return a short debug representation (truncated id and title)."""
        return f"AlertItem(id={self.id[:8]}, title='{self.title[:50]}...', status={self.status.value})"