Initial commit: breakpilot-core - Shared Infrastructure

Docker Compose with 24+ services:
- PostgreSQL (PostGIS), Valkey, MinIO, Qdrant
- Vault (PKI/TLS), Nginx (Reverse Proxy)
- Backend Core API, Consent Service, Billing Service
- RAG Service, Embedding Service
- Gitea, Woodpecker CI/CD
- Night Scheduler, Health Aggregator
- Jitsi (Web/XMPP/JVB/Jicofo), Mailpit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:13 +01:00
commit ad111d5e69
244 changed files with 84288 additions and 0 deletions
@@ -0,0 +1,12 @@
+"""Alert Agent Models."""
+
+from .alert_item import AlertItem, AlertSource, AlertStatus
+from .relevance_profile import RelevanceProfile, PriorityItem
+
+# Public names of this package: exactly the classes imported above from
+# the alert_item and relevance_profile modules.
+__all__ = [
+    "AlertItem",
+    "AlertSource",
+    "AlertStatus",
+    "RelevanceProfile",
+    "PriorityItem",
+]
@@ -0,0 +1,174 @@
+"""
+AlertItem Model.
+
+Repräsentiert einen einzelnen Alert aus Google Alerts (RSS oder Email).
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from typing import Optional
+import hashlib
+import uuid
+
+
+class AlertSource(str, Enum):
+    """Origin of an alert.
+
+    Members also subclass ``str``, so each member compares equal to its
+    plain string value.
+    """
+    GOOGLE_ALERTS_RSS = "google_alerts_rss"
+    GOOGLE_ALERTS_EMAIL = "google_alerts_email"
+    MANUAL = "manual"
+
+
+class AlertStatus(str, Enum):
+    """Processing status of an alert.
+
+    Members also subclass ``str``, so each member compares equal to its
+    plain string value.
+    """
+    NEW = "new"  # default status of a newly constructed AlertItem
+    PROCESSED = "processed"
+    DUPLICATE = "duplicate"
+    SCORED = "scored"
+    REVIEWED = "reviewed"
+    ARCHIVED = "archived"
+
+
+@dataclass
+class AlertItem:
+    """A single alert entry.
+
+    After initialisation, ``url_hash`` and ``canonical_url`` are derived
+    from ``url`` when they were not supplied and ``url`` is non-empty.
+    """
+
+    # Identification
+    id: str = field(default_factory=lambda: str(uuid.uuid4()))
+
+    # Source
+    source: AlertSource = AlertSource.GOOGLE_ALERTS_RSS
+    topic_label: str = ""  # e.g. "Schulrecht Bayern"
+    feed_url: Optional[str] = None
+
+    # Content
+    title: str = ""
+    url: str = ""
+    snippet: str = ""
+    article_text: Optional[str] = None
+
+    # Metadata
+    lang: str = "de"
+    published_at: Optional[datetime] = None
+    # Defaults to datetime.utcnow() at construction, i.e. a naive datetime.
+    fetched_at: datetime = field(default_factory=datetime.utcnow)
+
+    # Deduplication
+    canonical_url: Optional[str] = None
+    url_hash: Optional[str] = None
+    content_hash: Optional[str] = None  # SimHash for fuzzy matching
+
+    # Processing
+    status: AlertStatus = AlertStatus.NEW
+    cluster_id: Optional[str] = None
+
+    # Relevance (after scoring)
+    relevance_score: Optional[float] = None  # 0.0 - 1.0
+    relevance_decision: Optional[str] = None  # KEEP, DROP, REVIEW
+    relevance_reasons: list = field(default_factory=list)
+    relevance_summary: Optional[str] = None
+
+    def __post_init__(self):
+        """Fill in ``url_hash`` and ``canonical_url`` when they are missing."""
+        if not self.url_hash and self.url:
+            self.url_hash = self._compute_url_hash()
+        if not self.canonical_url and self.url:
+            self.canonical_url = self._normalize_url(self.url)
+
+    def _compute_url_hash(self) -> str:
+        """Return the first 16 hex characters of the SHA-256 of the normalized URL."""
+        normalized = self._normalize_url(self.url)
+        return hashlib.sha256(normalized.encode()).hexdigest()[:16]
+
+    def _normalize_url(self, url: str) -> str:
+        """Normalize a URL for deduplication.
+
+        Removes known tracking query parameters and the fragment,
+        lower-cases the network location and strips trailing slashes from
+        the path.
+
+        Args:
+            url: The URL to normalize.
+
+        Returns:
+            The normalized URL.
+        """
+        import urllib.parse
+
+        parsed = urllib.parse.urlparse(url)
+
+        # NOTE: Google News redirect URLs (news.google.com/.../articles/...)
+        # are not resolved to their target article here; they are normalized
+        # like any other URL. Resolving them is more involved and is left
+        # unimplemented.
+
+        # Query parameters that only carry tracking information.
+        tracking_params = {
+            "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
+            "fbclid", "gclid", "ref", "source"
+        }
+
+        # parse_qs omits parameters with blank values by default, so those
+        # are dropped from the normalized URL as well.
+        query_params = urllib.parse.parse_qs(parsed.query)
+        cleaned_params = {k: v for k, v in query_params.items()
+                         if k.lower() not in tracking_params}
+
+        cleaned_query = urllib.parse.urlencode(cleaned_params, doseq=True)
+
+        # Rebuild the URL without the fragment.
+        normalized = urllib.parse.urlunparse((
+            parsed.scheme,
+            parsed.netloc.lower(),
+            parsed.path.rstrip("/"),
+            parsed.params,
+            cleaned_query,
+            ""  # No fragment
+        ))
+
+        return normalized
+
+    def compute_content_hash(self, text: Optional[str] = None) -> str:
+        """Compute and store the SimHash of the alert content.
+
+        SimHash makes it possible to recognise similar texts even when they
+        differ slightly (e.g. different sources covering the same topic).
+
+        Args:
+            text: Text to hash. When falsy, the first non-empty value of
+                ``article_text``, ``snippet`` and ``title`` is used.
+
+        Returns:
+            ``content_hash`` after the call, or ``""`` when it is unset.
+            If no content is available, ``content_hash`` is left unchanged.
+        """
+        # Function-scope import of the SimHash helper from ..processing.dedup.
+        from ..processing.dedup import compute_simhash
+
+        content = text or self.article_text or self.snippet or self.title
+        if content:
+            self.content_hash = compute_simhash(content)
+        return self.content_hash or ""
+
+    def to_dict(self) -> dict:
+        """Convert the alert to a dictionary for JSON/DB use.
+
+        Enum fields are emitted as their string values and datetimes as
+        ISO-8601 strings (``None`` when unset).
+        """
+        return {
+            "id": self.id,
+            "source": self.source.value,
+            "topic_label": self.topic_label,
+            "feed_url": self.feed_url,
+            "title": self.title,
+            "url": self.url,
+            "snippet": self.snippet,
+            "article_text": self.article_text,
+            "lang": self.lang,
+            "published_at": self.published_at.isoformat() if self.published_at else None,
+            "fetched_at": self.fetched_at.isoformat() if self.fetched_at else None,
+            "canonical_url": self.canonical_url,
+            "url_hash": self.url_hash,
+            "content_hash": self.content_hash,
+            "status": self.status.value,
+            "cluster_id": self.cluster_id,
+            "relevance_score": self.relevance_score,
+            "relevance_decision": self.relevance_decision,
+            "relevance_reasons": self.relevance_reasons,
+            "relevance_summary": self.relevance_summary,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "AlertItem":
+        """Create an AlertItem from a dictionary.
+
+        String values for ``source`` and ``status`` are converted to their
+        enums, and string values for ``published_at`` and ``fetched_at``
+        are parsed with ``datetime.fromisoformat``. The input dictionary
+        is not modified.
+
+        Args:
+            data: Mapping of field names to values, e.g. the output of
+                ``to_dict``.
+
+        Returns:
+            The constructed AlertItem.
+
+        Raises:
+            ValueError: If an enum value or timestamp string is invalid.
+            TypeError: If ``data`` contains a key that is not a field.
+        """
+        # Work on a shallow copy so the caller's dictionary keeps its
+        # original (string) values.
+        values = dict(data)
+
+        # Parse enums
+        if isinstance(values.get("source"), str):
+            values["source"] = AlertSource(values["source"])
+        if isinstance(values.get("status"), str):
+            values["status"] = AlertStatus(values["status"])
+
+        # Parse timestamps
+        for field_name in ("published_at", "fetched_at"):
+            if isinstance(values.get(field_name), str):
+                values[field_name] = datetime.fromisoformat(values[field_name])
+
+        return cls(**values)
+
+    def __repr__(self) -> str:
+        return f"AlertItem(id={self.id[:8]}, title='{self.title[:50]}...', status={self.status.value})"
@@ -0,0 +1,288 @@
+"""
+RelevanceProfile Model.
+
+Definiert das Relevanzprofil eines Nutzers für die Alerts-Filterung.
+Lernt über Zeit durch Feedback.
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Optional
+import uuid
+
+
+@dataclass
+class PriorityItem:
+    """A priority topic within a relevance profile."""
+    label: str  # e.g. "Inklusion", "Datenschutz Schule"
+    weight: float = 0.5  # 0.0 - 1.0, higher = more important
+    keywords: list = field(default_factory=list)  # additional keywords
+    description: Optional[str] = None  # context for the LLM
+
+    def to_dict(self) -> dict:
+        """Return the item's four fields as a plain dictionary."""
+        field_names = ("label", "weight", "keywords", "description")
+        return {name: getattr(self, name) for name in field_names}
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "PriorityItem":
+        """Build a PriorityItem from a mapping of field names to values."""
+        item = cls(**data)
+        return item
+
+
+@dataclass
+class RelevanceProfile:
+    """
+    User profile for relevance scoring.
+
+    The profile is used to check alerts for relevance. It contains:
+    - Priorities: topics that matter (with a weight)
+    - Exclusions: topics that should be ignored
+    - Positive examples: URLs/titles that were relevant
+    - Negative examples: URLs/titles that were irrelevant
+    - Policies: additional rules (e.g. German sources only)
+    """
+
+    # Identification
+    id: str = field(default_factory=lambda: str(uuid.uuid4()))
+    user_id: Optional[str] = None  # set when the profile is user-specific
+
+    # Relevance criteria
+    priorities: list = field(default_factory=list)  # List[PriorityItem]
+    exclusions: list = field(default_factory=list)  # keywords to exclude
+
+    # Examples for few-shot learning
+    positive_examples: list = field(default_factory=list)  # relevant alerts
+    negative_examples: list = field(default_factory=list)  # irrelevant alerts
+
+    # Policies
+    policies: dict = field(default_factory=dict)
+
+    # Metadata (naive datetimes from datetime.utcnow)
+    created_at: datetime = field(default_factory=datetime.utcnow)
+    updated_at: datetime = field(default_factory=datetime.utcnow)
+
+    # Statistics
+    total_scored: int = 0
+    total_kept: int = 0
+    total_dropped: int = 0
+    accuracy_estimate: Optional[float] = None  # estimated accuracy
+
+    def add_priority(self, label: str, weight: float = 0.5, **kwargs) -> None:
+        """Append a priority topic.
+
+        Args:
+            label: Topic label.
+            weight: Topic weight.
+            **kwargs: Further keyword arguments passed on to PriorityItem.
+        """
+        self.priorities.append(PriorityItem(
+            label=label,
+            weight=weight,
+            **kwargs
+        ))
+        self.updated_at = datetime.utcnow()
+
+    def add_exclusion(self, keyword: str) -> None:
+        """Add an exclusion keyword unless it is already present."""
+        if keyword not in self.exclusions:
+            self.exclusions.append(keyword)
+            self.updated_at = datetime.utcnow()
+
+    def add_positive_example(self, title: str, url: str, reason: str = "") -> None:
+        """Record a positive example (for few-shot learning).
+
+        Only the 20 most recent positive examples are kept.
+        """
+        self.positive_examples.append({
+            "title": title,
+            "url": url,
+            "reason": reason,
+            "added_at": datetime.utcnow().isoformat(),
+        })
+        # Keep only the last 20 examples.
+        self.positive_examples = self.positive_examples[-20:]
+        self.updated_at = datetime.utcnow()
+
+    def add_negative_example(self, title: str, url: str, reason: str = "") -> None:
+        """Record a negative example.
+
+        Only the 20 most recent negative examples are kept.
+        """
+        self.negative_examples.append({
+            "title": title,
+            "url": url,
+            "reason": reason,
+            "added_at": datetime.utcnow().isoformat(),
+        })
+        # Keep only the last 20 examples.
+        self.negative_examples = self.negative_examples[-20:]
+        self.updated_at = datetime.utcnow()
+
+    def update_from_feedback(self, alert_title: str, alert_url: str,
+                             is_relevant: bool, reason: str = "") -> None:
+        """
+        Update the profile based on user feedback.
+
+        Stores the alert as a positive or negative example and updates the
+        kept/dropped/scored counters.
+
+        Args:
+            alert_title: Title of the alert
+            alert_url: URL of the alert
+            is_relevant: True if the user marked the alert as relevant
+            reason: Optional reason for the decision
+        """
+        if is_relevant:
+            self.add_positive_example(alert_title, alert_url, reason)
+            self.total_kept += 1
+        else:
+            self.add_negative_example(alert_title, alert_url, reason)
+            self.total_dropped += 1
+
+        self.total_scored += 1
+
+        # TODO: accuracy_estimate is not updated here. A real estimate would
+        # compare the scorer's prediction with the actual feedback once
+        # enough alerts have been scored.
+
+    @staticmethod
+    def _render_examples(heading: str, examples: list) -> list:
+        """Return prompt lines for the five most recent examples under heading."""
+        rendered = [heading]
+        for ex in examples[-5:]:  # last 5
+            rendered.append(f"- \"{ex['title']}\"")
+            if ex.get("reason"):
+                rendered.append(f"  Grund: {ex['reason']}")
+        rendered.append("")
+        return rendered
+
+    def get_prompt_context(self) -> str:
+        """
+        Generate context for the LLM prompt.
+
+        This text is inserted into the system prompt of the relevance
+        scorer. Sections for priorities, exclusions, examples and policies
+        are only emitted when the corresponding data is non-empty.
+
+        Returns:
+            The profile rendered as newline-joined Markdown text.
+        """
+        lines = ["## Relevanzprofil des Nutzers\n"]
+
+        # Priorities
+        if self.priorities:
+            lines.append("### Prioritäten (Themen von Interesse):")
+            for priority in self.priorities:
+                # Entries may be plain dicts instead of PriorityItem objects.
+                if isinstance(priority, dict):
+                    priority = PriorityItem.from_dict(priority)
+                if priority.weight > 0.7:
+                    weight_label = "Sehr wichtig"
+                elif priority.weight > 0.4:
+                    weight_label = "Wichtig"
+                else:
+                    weight_label = "Interessant"
+                lines.append(f"- **{priority.label}** ({weight_label})")
+                if priority.description:
+                    lines.append(f"  {priority.description}")
+                if priority.keywords:
+                    lines.append(f"  Keywords: {', '.join(priority.keywords)}")
+            lines.append("")
+
+        # Exclusions
+        if self.exclusions:
+            lines.append("### Ausschlüsse (ignorieren):")
+            lines.append(f"Themen mit diesen Keywords: {', '.join(self.exclusions)}")
+            lines.append("")
+
+        # Positive examples
+        if self.positive_examples:
+            lines.extend(self._render_examples(
+                "### Beispiele für relevante Alerts:", self.positive_examples))
+
+        # Negative examples
+        if self.negative_examples:
+            lines.extend(self._render_examples(
+                "### Beispiele für irrelevante Alerts:", self.negative_examples))
+
+        # Policies
+        if self.policies:
+            lines.append("### Zusätzliche Regeln:")
+            for key, value in self.policies.items():
+                lines.append(f"- {key}: {value}")
+
+        return "\n".join(lines)
+
+    def to_dict(self) -> dict:
+        """Convert the profile to a dictionary.
+
+        PriorityItem entries are serialised via their ``to_dict``; other
+        entries are passed through unchanged. Timestamps are emitted as
+        ISO-8601 strings.
+        """
+        return {
+            "id": self.id,
+            "user_id": self.user_id,
+            "priorities": [p.to_dict() if isinstance(p, PriorityItem) else p
+                          for p in self.priorities],
+            "exclusions": self.exclusions,
+            "positive_examples": self.positive_examples,
+            "negative_examples": self.negative_examples,
+            "policies": self.policies,
+            "created_at": self.created_at.isoformat(),
+            "updated_at": self.updated_at.isoformat(),
+            "total_scored": self.total_scored,
+            "total_kept": self.total_kept,
+            "total_dropped": self.total_dropped,
+            "accuracy_estimate": self.accuracy_estimate,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "RelevanceProfile":
+        """Create a RelevanceProfile from a dictionary.
+
+        String values for ``created_at`` and ``updated_at`` are parsed with
+        ``datetime.fromisoformat`` and dict entries in ``priorities`` are
+        converted to PriorityItem objects. The input dictionary is not
+        modified.
+
+        Args:
+            data: Mapping of field names to values, e.g. the output of
+                ``to_dict``.
+
+        Returns:
+            The constructed RelevanceProfile.
+
+        Raises:
+            ValueError: If a timestamp string is invalid.
+            TypeError: If ``data`` contains a key that is not a field.
+        """
+        # Work on a shallow copy so the caller's dictionary keeps its
+        # original values.
+        values = dict(data)
+
+        # Parse timestamps
+        for field_name in ("created_at", "updated_at"):
+            if isinstance(values.get(field_name), str):
+                values[field_name] = datetime.fromisoformat(values[field_name])
+
+        # Parse priorities
+        if "priorities" in values:
+            values["priorities"] = [
+                PriorityItem.from_dict(p) if isinstance(p, dict) else p
+                for p in values["priorities"]
+            ]
+
+        return cls(**values)
+
+    @classmethod
+    def create_default_education_profile(cls) -> "RelevanceProfile":
+        """
+        Create a default profile for education topics.
+
+        This profile is tuned for teachers / school staff.
+        """
+        profile = cls()
+
+        # Education-related priorities
+        profile.add_priority(
+            "Inklusion",
+            weight=0.9,
+            keywords=["inklusiv", "Förderbedarf", "Behinderung", "Barrierefreiheit"],
+            description="Inklusive Bildung, Förderschulen, Nachteilsausgleich"
+        )
+        profile.add_priority(
+            "Datenschutz Schule",
+            weight=0.85,
+            keywords=["DSGVO", "Schülerfotos", "Einwilligung", "personenbezogene Daten"],
+            description="DSGVO in Schulen, Datenschutz bei Klassenfotos"
+        )
+        profile.add_priority(
+            "Schulrecht Bayern",
+            weight=0.8,
+            keywords=["BayEUG", "Schulordnung", "Kultusministerium", "Bayern"],
+            description="Bayerisches Schulrecht, Verordnungen"
+        )
+        profile.add_priority(
+            "Digitalisierung Schule",
+            weight=0.7,
+            keywords=["DigitalPakt", "Tablet-Klasse", "Lernplattform"],
+            description="Digitale Medien im Unterricht"
+        )
+        profile.add_priority(
+            "Elternarbeit",
+            weight=0.6,
+            keywords=["Elternbeirat", "Elternabend", "Kommunikation"],
+            description="Zusammenarbeit mit Eltern"
+        )
+
+        # Default exclusions
+        profile.exclusions = [
+            "Stellenanzeige",
+            "Praktikum gesucht",
+            "Werbung",
+            "Pressemitteilung",  # often generic
+        ]
+
+        # Policies
+        profile.policies = {
+            "prefer_german_sources": True,
+            "max_age_days": 30,  # ignore older alerts
+            "min_content_length": 100,  # ignore very short snippets
+        }
+
+        return profile
+
+    def __repr__(self) -> str:
+        return f"RelevanceProfile(id={self.id[:8]}, priorities={len(self.priorities)}, examples={len(self.positive_examples) + len(self.negative_examples)})"