Initial commit: breakpilot-core - Shared Infrastructure
Docker Compose with 24+ services: - PostgreSQL (PostGIS), Valkey, MinIO, Qdrant - Vault (PKI/TLS), Nginx (Reverse Proxy) - Backend Core API, Consent Service, Billing Service - RAG Service, Embedding Service - Gitea, Woodpecker CI/CD - Night Scheduler, Health Aggregator - Jitsi (Web/XMPP/JVB/Jicofo), Mailpit Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
12
docs-src/backend/alerts_agent/models/__init__.py
Normal file
12
docs-src/backend/alerts_agent/models/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""Alert Agent Models."""
|
||||
|
||||
from .alert_item import AlertItem, AlertSource, AlertStatus
|
||||
from .relevance_profile import RelevanceProfile, PriorityItem
|
||||
|
||||
__all__ = [
|
||||
"AlertItem",
|
||||
"AlertSource",
|
||||
"AlertStatus",
|
||||
"RelevanceProfile",
|
||||
"PriorityItem",
|
||||
]
|
||||
174
docs-src/backend/alerts_agent/models/alert_item.py
Normal file
174
docs-src/backend/alerts_agent/models/alert_item.py
Normal file
@@ -0,0 +1,174 @@
|
||||
"""
|
||||
AlertItem Model.
|
||||
|
||||
Repräsentiert einen einzelnen Alert aus Google Alerts (RSS oder Email).
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
import hashlib
|
||||
import uuid
|
||||
|
||||
|
||||
class AlertSource(str, Enum):
    """Origin of an alert (str-valued so it serializes cleanly to JSON/DB)."""
    GOOGLE_ALERTS_RSS = "google_alerts_rss"      # pulled from a Google Alerts RSS feed
    GOOGLE_ALERTS_EMAIL = "google_alerts_email"  # parsed out of a Google Alerts email
    MANUAL = "manual"                            # entered by hand
|
||||
|
||||
|
||||
class AlertStatus(str, Enum):
    """Processing state of an alert (str-valued for JSON/DB serialization)."""
    NEW = "new"              # just fetched, not yet processed
    PROCESSED = "processed"  # normalized / enriched
    DUPLICATE = "duplicate"  # detected as duplicate of another alert
    SCORED = "scored"        # relevance scoring completed
    REVIEWED = "reviewed"    # user has reviewed it
    ARCHIVED = "archived"    # no longer active
|
||||
|
||||
|
||||
@dataclass
class AlertItem:
    """A single alert entry from Google Alerts (RSS or email) or manual input.

    Dedup fields (``url_hash``, ``canonical_url``) are derived automatically
    in ``__post_init__`` when a ``url`` is present.
    """

    # Identification
    id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Source
    source: AlertSource = AlertSource.GOOGLE_ALERTS_RSS
    topic_label: str = ""  # e.g. "Schulrecht Bayern"
    feed_url: Optional[str] = None

    # Content
    title: str = ""
    url: str = ""
    snippet: str = ""
    article_text: Optional[str] = None

    # Metadata
    lang: str = "de"
    published_at: Optional[datetime] = None
    # NOTE(review): datetime.utcnow() is deprecated in Python 3.12; kept here
    # because to_dict()/from_dict() round-trip naive ISO timestamps — switching
    # to timezone-aware values would change the serialized format.
    fetched_at: datetime = field(default_factory=datetime.utcnow)

    # Deduplication
    canonical_url: Optional[str] = None
    url_hash: Optional[str] = None
    content_hash: Optional[str] = None  # SimHash for fuzzy matching

    # Processing
    status: AlertStatus = AlertStatus.NEW
    cluster_id: Optional[str] = None

    # Relevance (filled in after scoring)
    relevance_score: Optional[float] = None   # 0.0 - 1.0
    relevance_decision: Optional[str] = None  # KEEP, DROP, REVIEW
    relevance_reasons: list = field(default_factory=list)
    relevance_summary: Optional[str] = None

    def __post_init__(self):
        """Derive dedup fields (hash, canonical URL) once a URL is known."""
        if not self.url_hash and self.url:
            self.url_hash = self._compute_url_hash()
        if not self.canonical_url and self.url:
            self.canonical_url = self._normalize_url(self.url)

    def _compute_url_hash(self) -> str:
        """Return the first 16 hex chars of SHA256 over the normalized URL."""
        normalized = self._normalize_url(self.url)
        return hashlib.sha256(normalized.encode()).hexdigest()[:16]

    def _normalize_url(self, url: str) -> str:
        """Normalize a URL for deduplication.

        Lower-cases the host, strips the fragment, trailing path slashes and
        common tracking query parameters.
        """
        import urllib.parse

        parsed = urllib.parse.urlparse(url)

        # Google News redirect URLs embed the real target base64-encoded in
        # the path; only basic handling here — real resolution is more complex.
        if "news.google.com" in parsed.netloc and "/articles/" in parsed.path:
            pass

        # Tracking parameters to drop (compared case-insensitively).
        tracking_params = {
            "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
            "fbclid", "gclid", "ref", "source"
        }

        query_params = urllib.parse.parse_qs(parsed.query)
        cleaned_params = {k: v for k, v in query_params.items()
                          if k.lower() not in tracking_params}

        cleaned_query = urllib.parse.urlencode(cleaned_params, doseq=True)

        # Rebuild the URL without a fragment.
        normalized = urllib.parse.urlunparse((
            parsed.scheme,
            parsed.netloc.lower(),
            parsed.path.rstrip("/"),
            parsed.params,
            cleaned_query,
            ""  # No fragment
        ))

        return normalized

    def compute_content_hash(self, text: Optional[str] = None) -> str:
        """Compute and store a SimHash of the content for fuzzy matching.

        SimHash lets us detect near-duplicate texts even when they differ
        slightly (e.g. different sources covering the same story). Falls back
        from explicit *text* to article_text, then snippet, then title.
        """
        from ..processing.dedup import compute_simhash

        content = text or self.article_text or self.snippet or self.title
        if content:
            self.content_hash = compute_simhash(content)
        return self.content_hash or ""

    def to_dict(self) -> dict:
        """Serialize to a JSON/DB-friendly dict (enums and timestamps as strings)."""
        return {
            "id": self.id,
            "source": self.source.value,
            "topic_label": self.topic_label,
            "feed_url": self.feed_url,
            "title": self.title,
            "url": self.url,
            "snippet": self.snippet,
            "article_text": self.article_text,
            "lang": self.lang,
            "published_at": self.published_at.isoformat() if self.published_at else None,
            "fetched_at": self.fetched_at.isoformat() if self.fetched_at else None,
            "canonical_url": self.canonical_url,
            "url_hash": self.url_hash,
            "content_hash": self.content_hash,
            "status": self.status.value,
            "cluster_id": self.cluster_id,
            "relevance_score": self.relevance_score,
            "relevance_decision": self.relevance_decision,
            "relevance_reasons": self.relevance_reasons,
            "relevance_summary": self.relevance_summary,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "AlertItem":
        """Build an AlertItem from a ``to_dict()``-style dict.

        Does not mutate *data*: enum/timestamp parsing happens on a shallow
        copy (the original implementation rewrote the caller's dict in place).
        """
        data = dict(data)

        # Parse enums
        if "source" in data and isinstance(data["source"], str):
            data["source"] = AlertSource(data["source"])
        if "status" in data and isinstance(data["status"], str):
            data["status"] = AlertStatus(data["status"])

        # Parse timestamps
        for field_name in ("published_at", "fetched_at"):
            if field_name in data and isinstance(data[field_name], str):
                data[field_name] = datetime.fromisoformat(data[field_name])

        return cls(**data)

    def __repr__(self) -> str:
        return f"AlertItem(id={self.id[:8]}, title='{self.title[:50]}...', status={self.status.value})"
|
||||
288
docs-src/backend/alerts_agent/models/relevance_profile.py
Normal file
288
docs-src/backend/alerts_agent/models/relevance_profile.py
Normal file
@@ -0,0 +1,288 @@
|
||||
"""
|
||||
RelevanceProfile Model.
|
||||
|
||||
Definiert das Relevanzprofil eines Nutzers für die Alerts-Filterung.
|
||||
Lernt über Zeit durch Feedback.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
import uuid
|
||||
|
||||
|
||||
@dataclass
class PriorityItem:
    """A weighted topic of interest inside a relevance profile."""
    label: str                    # e.g. "Inklusion", "Datenschutz Schule"
    weight: float = 0.5           # importance in [0.0, 1.0]; higher = more important
    keywords: list = field(default_factory=list)  # extra matching keywords
    description: Optional[str] = None             # context handed to the LLM

    # Field names that participate in (de)serialization, in output order.
    _FIELDS = ("label", "weight", "keywords", "description")

    def to_dict(self) -> dict:
        """Serialize to a plain dict (JSON/DB friendly)."""
        return {name: getattr(self, name) for name in self._FIELDS}

    @classmethod
    def from_dict(cls, data: dict) -> "PriorityItem":
        """Rebuild a PriorityItem from ``to_dict()`` output."""
        return cls(**data)
|
||||
|
||||
|
||||
@dataclass
class RelevanceProfile:
    """User profile for relevance scoring of alerts; learns over time via feedback.

    The profile is used to check incoming alerts for relevance. It holds:
    - priorities: weighted topics of interest
    - exclusions: keywords whose presence marks an alert as irrelevant
    - positive examples: URLs/titles the user found relevant
    - negative examples: URLs/titles the user found irrelevant
    - policies: additional free-form rules (e.g. prefer German sources)
    """

    # Identification
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    user_id: Optional[str] = None  # set when the profile is user-specific

    # Relevance criteria
    priorities: list = field(default_factory=list)  # List[PriorityItem] (may hold dicts after deserialization)
    exclusions: list = field(default_factory=list)  # keywords that exclude an alert

    # Examples for few-shot learning
    positive_examples: list = field(default_factory=list)  # alerts marked relevant
    negative_examples: list = field(default_factory=list)  # alerts marked irrelevant

    # Policies (free-form key/value rules)
    policies: dict = field(default_factory=dict)

    # Metadata
    # NOTE(review): datetime.utcnow() is deprecated in Python 3.12; kept because
    # to_dict()/from_dict() round-trip naive ISO timestamps.
    created_at: datetime = field(default_factory=datetime.utcnow)
    updated_at: datetime = field(default_factory=datetime.utcnow)

    # Feedback statistics
    total_scored: int = 0
    total_kept: int = 0
    total_dropped: int = 0
    accuracy_estimate: Optional[float] = None  # rough estimate; not computed yet

    def add_priority(self, label: str, weight: float = 0.5, **kwargs) -> None:
        """Add a priority topic (extra kwargs are forwarded to PriorityItem)."""
        self.priorities.append(PriorityItem(
            label=label,
            weight=weight,
            **kwargs
        ))
        self.updated_at = datetime.utcnow()

    def add_exclusion(self, keyword: str) -> None:
        """Add an exclusion keyword; no-op if it is already present."""
        if keyword not in self.exclusions:
            self.exclusions.append(keyword)
            self.updated_at = datetime.utcnow()

    def add_positive_example(self, title: str, url: str, reason: str = "") -> None:
        """Record a relevant alert as a few-shot example (keeps the last 20)."""
        self.positive_examples.append({
            "title": title,
            "url": url,
            "reason": reason,
            "added_at": datetime.utcnow().isoformat(),
        })
        # Cap at the 20 most recent examples.
        self.positive_examples = self.positive_examples[-20:]
        self.updated_at = datetime.utcnow()

    def add_negative_example(self, title: str, url: str, reason: str = "") -> None:
        """Record an irrelevant alert as a few-shot example (keeps the last 20)."""
        self.negative_examples.append({
            "title": title,
            "url": url,
            "reason": reason,
            "added_at": datetime.utcnow().isoformat(),
        })
        # Cap at the 20 most recent examples.
        self.negative_examples = self.negative_examples[-20:]
        self.updated_at = datetime.utcnow()

    def update_from_feedback(self, alert_title: str, alert_url: str,
                             is_relevant: bool, reason: str = "") -> None:
        """Update the profile from user feedback on a single alert.

        Args:
            alert_title: Title of the alert.
            alert_url: URL of the alert.
            is_relevant: True if the user marked the alert as relevant.
            reason: Optional reason for the decision.
        """
        if is_relevant:
            self.add_positive_example(alert_title, alert_url, reason)
            self.total_kept += 1
        else:
            self.add_negative_example(alert_title, alert_url, reason)
            self.total_dropped += 1

        self.total_scored += 1

        # Accuracy estimation is intentionally not implemented yet: it would
        # require comparing the scorer's predictions against actual feedback.
        if self.total_scored > 10:
            pass

    def get_prompt_context(self) -> str:
        """Render the profile as markdown for the relevance scorer's system prompt.

        Output strings are intentionally German — they are consumed by a
        German-language LLM prompt.
        """
        lines = ["## Relevanzprofil des Nutzers\n"]

        # Priorities
        if self.priorities:
            lines.append("### Prioritäten (Themen von Interesse):")
            for p in self.priorities:
                # Tolerate raw dicts (e.g. freshly deserialized profiles).
                if isinstance(p, dict):
                    p = PriorityItem.from_dict(p)
                weight_label = "Sehr wichtig" if p.weight > 0.7 else "Wichtig" if p.weight > 0.4 else "Interessant"
                lines.append(f"- **{p.label}** ({weight_label})")
                if p.description:
                    lines.append(f"  {p.description}")
                if p.keywords:
                    lines.append(f"  Keywords: {', '.join(p.keywords)}")
            lines.append("")

        # Exclusions
        if self.exclusions:
            lines.append("### Ausschlüsse (ignorieren):")
            lines.append(f"Themen mit diesen Keywords: {', '.join(self.exclusions)}")
            lines.append("")

        # Positive examples (most recent 5)
        if self.positive_examples:
            lines.append("### Beispiele für relevante Alerts:")
            for ex in self.positive_examples[-5:]:
                lines.append(f"- \"{ex['title']}\"")
                if ex.get("reason"):
                    lines.append(f"  Grund: {ex['reason']}")
            lines.append("")

        # Negative examples (most recent 5)
        if self.negative_examples:
            lines.append("### Beispiele für irrelevante Alerts:")
            for ex in self.negative_examples[-5:]:
                lines.append(f"- \"{ex['title']}\"")
                if ex.get("reason"):
                    lines.append(f"  Grund: {ex['reason']}")
            lines.append("")

        # Policies
        if self.policies:
            lines.append("### Zusätzliche Regeln:")
            for key, value in self.policies.items():
                lines.append(f"- {key}: {value}")

        return "\n".join(lines)

    def to_dict(self) -> dict:
        """Serialize to a JSON/DB-friendly dict (timestamps as ISO strings)."""
        return {
            "id": self.id,
            "user_id": self.user_id,
            "priorities": [p.to_dict() if isinstance(p, PriorityItem) else p
                           for p in self.priorities],
            "exclusions": self.exclusions,
            "positive_examples": self.positive_examples,
            "negative_examples": self.negative_examples,
            "policies": self.policies,
            "created_at": self.created_at.isoformat(),
            "updated_at": self.updated_at.isoformat(),
            "total_scored": self.total_scored,
            "total_kept": self.total_kept,
            "total_dropped": self.total_dropped,
            "accuracy_estimate": self.accuracy_estimate,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "RelevanceProfile":
        """Build a RelevanceProfile from a ``to_dict()``-style dict.

        Does not mutate *data*: timestamp/priority parsing happens on a
        shallow copy (the original implementation rewrote the caller's dict
        in place).
        """
        data = dict(data)

        # Parse timestamps
        for field_name in ("created_at", "updated_at"):
            if field_name in data and isinstance(data[field_name], str):
                data[field_name] = datetime.fromisoformat(data[field_name])

        # Parse priorities
        if "priorities" in data:
            data["priorities"] = [
                PriorityItem.from_dict(p) if isinstance(p, dict) else p
                for p in data["priorities"]
            ]

        return cls(**data)

    @classmethod
    def create_default_education_profile(cls) -> "RelevanceProfile":
        """Build a default profile for education topics.

        This profile is tuned for teachers / school staff.
        """
        profile = cls()

        # Education-related priorities
        profile.add_priority(
            "Inklusion",
            weight=0.9,
            keywords=["inklusiv", "Förderbedarf", "Behinderung", "Barrierefreiheit"],
            description="Inklusive Bildung, Förderschulen, Nachteilsausgleich"
        )
        profile.add_priority(
            "Datenschutz Schule",
            weight=0.85,
            keywords=["DSGVO", "Schülerfotos", "Einwilligung", "personenbezogene Daten"],
            description="DSGVO in Schulen, Datenschutz bei Klassenfotos"
        )
        profile.add_priority(
            "Schulrecht Bayern",
            weight=0.8,
            keywords=["BayEUG", "Schulordnung", "Kultusministerium", "Bayern"],
            description="Bayerisches Schulrecht, Verordnungen"
        )
        profile.add_priority(
            "Digitalisierung Schule",
            weight=0.7,
            keywords=["DigitalPakt", "Tablet-Klasse", "Lernplattform"],
            description="Digitale Medien im Unterricht"
        )
        profile.add_priority(
            "Elternarbeit",
            weight=0.6,
            keywords=["Elternbeirat", "Elternabend", "Kommunikation"],
            description="Zusammenarbeit mit Eltern"
        )

        # Default exclusions
        profile.exclusions = [
            "Stellenanzeige",
            "Praktikum gesucht",
            "Werbung",
            "Pressemitteilung",  # often generic
        ]

        # Policies
        profile.policies = {
            "prefer_german_sources": True,
            "max_age_days": 30,        # ignore older alerts
            "min_content_length": 100, # ignore very short snippets
        }

        return profile

    def __repr__(self) -> str:
        return f"RelevanceProfile(id={self.id[:8]}, priorities={len(self.priorities)}, examples={len(self.positive_examples) + len(self.negative_examples)})"
|
||||
Reference in New Issue
Block a user