backend-lehrer (5 files): - alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3) - teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3) - mail/mail_db.py (987 → 6) klausur-service (5 files): - legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4) - ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2) - KorrekturPage.tsx (956 → 6) website (5 pages): - mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7) - ocr-labeling (946 → 7), audit-workspace (871 → 4) studio-v2 (5 files + 1 deleted): - page.tsx (946 → 5), MessagesContext.tsx (925 → 4) - korrektur (914 → 6), worksheet-cleanup (899 → 6) - useVocabWorksheet.ts (888 → 3) - Deleted dead page-original.tsx (934 LOC) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
395 lines
12 KiB
Python
395 lines
12 KiB
Python
"""
|
|
Repository für Alert Items (einzelne Alerts/Artikel).
|
|
"""
|
|
import hashlib
|
|
import urllib.parse
|
|
import uuid
|
|
from datetime import datetime, timedelta
|
|
from typing import Optional, List, Dict, Any
|
|
from sqlalchemy.orm import Session as DBSession
|
|
from sqlalchemy import or_, func
|
|
|
|
from .models import (
|
|
AlertItemDB, AlertSourceEnum, AlertStatusEnum, RelevanceDecisionEnum
|
|
)
|
|
|
|
|
|
class AlertItemRepository:
    """Repository for alert items (individual alerts/articles).

    Bundles the create/read/update/delete queries for ``AlertItemDB`` rows
    around a single SQLAlchemy session. Every mutating method commits on the
    session that was handed to the constructor.
    """

    # Query parameters that only carry tracking information. They are dropped
    # before a URL is stored as canonical URL or hashed for de-duplication.
    _TRACKING_PARAMS = frozenset({
        "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
        "fbclid", "gclid", "ref", "source",
    })

    def __init__(self, db: DBSession):
        """Store the session that all queries of this repository run on."""
        self.db = db

    # ==================== CREATE ====================

    def create(
        self,
        topic_id: str,
        title: str,
        url: str,
        snippet: str = "",
        source: str = "google_alerts_rss",
        published_at: Optional[datetime] = None,
        lang: str = "de",
    ) -> AlertItemDB:
        """Create and persist a new alert.

        The alert gets a fresh UUID4 id, the normalized form of ``url`` as
        ``canonical_url`` and the hash of that normalized URL as ``url_hash``.

        Args:
            topic_id: ID of the topic the alert belongs to.
            title: Alert title.
            url: Original article URL (stored unchanged in ``url``).
            snippet: Short text excerpt.
            source: Value that is converted via ``AlertSourceEnum(source)``.
            published_at: Publication time, if known.
            lang: Language code stored on the alert.

        Returns:
            The committed and refreshed ``AlertItemDB`` instance.
        """
        url_hash = self._compute_url_hash(url)

        alert = AlertItemDB(
            id=str(uuid.uuid4()),
            topic_id=topic_id,
            title=title,
            url=url,
            snippet=snippet,
            source=AlertSourceEnum(source),
            published_at=published_at,
            lang=lang,
            url_hash=url_hash,
            canonical_url=self._normalize_url(url),
        )
        self.db.add(alert)
        self.db.commit()
        self.db.refresh(alert)
        return alert

    def create_if_not_exists(
        self,
        topic_id: str,
        title: str,
        url: str,
        snippet: str = "",
        source: str = "google_alerts_rss",
        published_at: Optional[datetime] = None,
        lang: str = "de",
    ) -> Optional[AlertItemDB]:
        """Create an alert only if no alert with the same URL hash exists.

        The duplicate check is done on ``url_hash`` alone, i.e. across all
        topics, not just within ``topic_id``.

        Args:
            topic_id: ID of the topic the alert belongs to.
            title: Alert title.
            url: Original article URL.
            snippet: Short text excerpt.
            source: Value that is converted via ``AlertSourceEnum(source)``.
            published_at: Publication time, if known.
            lang: Language code, forwarded to :meth:`create`.

        Returns:
            The newly created alert, or ``None`` if the URL is a duplicate.
        """
        # NOTE(review): check-then-insert is not atomic; concurrent writers
        # can still race unless the database enforces uniqueness on url_hash.
        if self.get_by_url_hash(self._compute_url_hash(url)):
            return None  # duplicate

        return self.create(
            topic_id=topic_id,
            title=title,
            url=url,
            snippet=snippet,
            source=source,
            published_at=published_at,
            lang=lang,
        )

    # ==================== READ ====================

    def get_by_id(self, alert_id: str) -> Optional[AlertItemDB]:
        """Return the alert with the given ID, or ``None`` if there is none."""
        return self.db.query(AlertItemDB).filter(
            AlertItemDB.id == alert_id
        ).first()

    def get_by_url_hash(self, url_hash: str) -> Optional[AlertItemDB]:
        """Return the first alert with the given URL hash, or ``None``."""
        return self.db.query(AlertItemDB).filter(
            AlertItemDB.url_hash == url_hash
        ).first()

    def get_inbox(
        self,
        user_id: Optional[str] = None,
        topic_id: Optional[str] = None,
        decision: Optional[str] = None,
        status: Optional[str] = None,
        limit: int = 50,
        offset: int = 0,
    ) -> List[AlertItemDB]:
        """Return inbox items, optionally filtered.

        Without ``decision`` the result contains alerts whose relevance
        decision is KEEP or REVIEW as well as alerts that have no decision
        yet (NULL). Results are ordered by relevance score descending (NULL
        scores last), then by ``fetched_at`` descending.

        Args:
            user_id: Accepted for interface compatibility; currently not
                used by the query.
            topic_id: Restrict to this topic if given.
            decision: ``RelevanceDecisionEnum`` value to filter on.
            status: ``AlertStatusEnum`` value to filter on.
            limit: Maximum number of rows.
            offset: Number of rows to skip.

        Returns:
            List of matching alerts.
        """
        query = self.db.query(AlertItemDB)

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        if decision:
            query = query.filter(
                AlertItemDB.relevance_decision == RelevanceDecisionEnum(decision)
            )
        else:
            # Default: KEEP, REVIEW and not-yet-scored alerts
            query = query.filter(
                or_(
                    AlertItemDB.relevance_decision == RelevanceDecisionEnum.KEEP,
                    AlertItemDB.relevance_decision == RelevanceDecisionEnum.REVIEW,
                    AlertItemDB.relevance_decision.is_(None),
                )
            )

        if status:
            query = query.filter(AlertItemDB.status == AlertStatusEnum(status))

        return query.order_by(
            AlertItemDB.relevance_score.desc().nullslast(),
            AlertItemDB.fetched_at.desc(),
        ).offset(offset).limit(limit).all()

    def get_unscored(
        self,
        topic_id: Optional[str] = None,
        limit: int = 100,
    ) -> List[AlertItemDB]:
        """Return alerts that still have status NEW, newest fetched first.

        Args:
            topic_id: Restrict to this topic if given.
            limit: Maximum number of rows.
        """
        query = self.db.query(AlertItemDB).filter(
            AlertItemDB.status == AlertStatusEnum.NEW
        )

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        return query.order_by(AlertItemDB.fetched_at.desc()).limit(limit).all()

    def get_by_topic(
        self,
        topic_id: str,
        limit: int = 100,
        offset: int = 0,
    ) -> List[AlertItemDB]:
        """Return the alerts of one topic, newest fetched first.

        Args:
            topic_id: Topic whose alerts are returned.
            limit: Maximum number of rows.
            offset: Number of rows to skip.
        """
        return self.db.query(AlertItemDB).filter(
            AlertItemDB.topic_id == topic_id
        ).order_by(
            AlertItemDB.fetched_at.desc()
        ).offset(offset).limit(limit).all()

    def count_by_status(self, topic_id: Optional[str] = None) -> Dict[str, int]:
        """Count alerts grouped by status.

        Args:
            topic_id: Restrict the count to this topic if given.

        Returns:
            Mapping of status value (``status.value``) to number of alerts.
        """
        query = self.db.query(
            AlertItemDB.status,
            func.count(AlertItemDB.id).label('count'),
        )

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        results = query.group_by(AlertItemDB.status).all()

        return {r[0].value: r[1] for r in results}

    def count_by_decision(self, topic_id: Optional[str] = None) -> Dict[str, int]:
        """Count alerts grouped by relevance decision.

        Args:
            topic_id: Restrict the count to this topic if given.

        Returns:
            Mapping of decision value to number of alerts; alerts without a
            decision are reported under the key ``"unscored"``.
        """
        query = self.db.query(
            AlertItemDB.relevance_decision,
            func.count(AlertItemDB.id).label('count'),
        )

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        results = query.group_by(AlertItemDB.relevance_decision).all()

        return {
            (r[0].value if r[0] else "unscored"): r[1]
            for r in results
        }

    # ==================== UPDATE ====================

    def update_scoring(
        self,
        alert_id: str,
        score: float,
        decision: str,
        reasons: Optional[List[str]] = None,
        summary: Optional[str] = None,
        model: Optional[str] = None,
    ) -> Optional[AlertItemDB]:
        """Store a scoring result on an alert and mark it as SCORED.

        Args:
            alert_id: ID of the alert to update.
            score: Relevance score.
            decision: Value that is converted via ``RelevanceDecisionEnum``.
            reasons: Reasons for the decision; stored as ``[]`` if omitted.
            summary: Optional summary text.
            model: Name of the model that produced the score.

        Returns:
            The updated alert, or ``None`` if no alert has this ID.
        """
        alert = self.get_by_id(alert_id)
        if not alert:
            return None

        # One timestamp for both columns so they describe the same instant.
        # NOTE(review): utcnow() yields a naive datetime; kept as-is because
        # the column types are not visible here.
        now = datetime.utcnow()

        alert.relevance_score = score
        alert.relevance_decision = RelevanceDecisionEnum(decision)
        alert.relevance_reasons = reasons or []
        alert.relevance_summary = summary
        alert.scored_by_model = model
        alert.scored_at = now
        alert.status = AlertStatusEnum.SCORED
        alert.processed_at = now

        self.db.commit()
        self.db.refresh(alert)
        return alert

    def update_status(
        self,
        alert_id: str,
        status: str,
    ) -> Optional[AlertItemDB]:
        """Set the status of an alert.

        Args:
            alert_id: ID of the alert to update.
            status: Value that is converted via ``AlertStatusEnum(status)``.

        Returns:
            The updated alert, or ``None`` if no alert has this ID.
        """
        alert = self.get_by_id(alert_id)
        if not alert:
            return None

        alert.status = AlertStatusEnum(status)

        self.db.commit()
        self.db.refresh(alert)
        return alert

    def mark_reviewed(
        self,
        alert_id: str,
        is_relevant: bool,
        notes: Optional[str] = None,
        tags: Optional[List[str]] = None,
    ) -> Optional[AlertItemDB]:
        """Mark an alert as REVIEWED and record the user's feedback.

        Args:
            alert_id: ID of the alert to update.
            is_relevant: Whether the user considers the alert relevant.
            notes: Free-text notes; existing notes are kept if this is empty.
            tags: User tags; existing tags are kept if this is empty.

        Returns:
            The updated alert, or ``None`` if no alert has this ID.
        """
        alert = self.get_by_id(alert_id)
        if not alert:
            return None

        alert.status = AlertStatusEnum.REVIEWED
        alert.user_marked_relevant = is_relevant
        # Only overwrite when a non-empty value was supplied.
        if notes:
            alert.user_notes = notes
        if tags:
            alert.user_tags = tags

        self.db.commit()
        self.db.refresh(alert)
        return alert

    def archive(self, alert_id: str) -> Optional[AlertItemDB]:
        """Set the alert's status to ``"archived"``; ``None`` if not found."""
        return self.update_status(alert_id, "archived")

    # ==================== DELETE ====================

    def delete(self, alert_id: str) -> bool:
        """Delete one alert.

        Returns:
            ``True`` if the alert existed and was deleted, ``False`` otherwise.
        """
        alert = self.get_by_id(alert_id)
        if not alert:
            return False

        self.db.delete(alert)
        self.db.commit()
        return True

    def delete_old(self, days: int = 90, topic_id: Optional[str] = None) -> int:
        """Delete archived alerts that were fetched more than ``days`` ago.

        Args:
            days: Age threshold in days, compared against ``fetched_at``.
            topic_id: Restrict the deletion to this topic if given.

        Returns:
            Number of deleted rows as reported by the bulk delete.
        """
        cutoff = datetime.utcnow() - timedelta(days=days)

        query = self.db.query(AlertItemDB).filter(
            AlertItemDB.status == AlertStatusEnum.ARCHIVED,
            AlertItemDB.fetched_at < cutoff,
        )

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        count = query.delete()
        self.db.commit()
        return count

    # ==================== FOR RSS FETCHER ====================

    def get_existing_urls(self, topic_id: str) -> set:
        """Return all known URL hashes of a topic.

        Used by the RSS fetcher to avoid duplicates. Despite the method
        name, the set contains ``url_hash`` values, not URLs; empty/NULL
        hashes are skipped.
        """
        results = self.db.query(AlertItemDB.url_hash).filter(
            AlertItemDB.topic_id == topic_id
        ).all()

        return {r[0] for r in results if r[0]}

    def create_from_alert_item(self, alert_item, topic_id: str) -> AlertItemDB:
        """Create an alert from an AlertItem object of the RSS fetcher.

        Args:
            alert_item: AlertItem from rss_fetcher; ``title``, ``url``,
                ``snippet``, ``source`` and ``published_at`` are read.
            topic_id: Topic ID to associate with.

        Returns:
            Created AlertItemDB instance.
        """
        source = alert_item.source
        # Accept both enum-like sources (with .value) and plain values.
        source_value = source.value if hasattr(source, 'value') else str(source)

        return self.create(
            topic_id=topic_id,
            title=alert_item.title,
            url=alert_item.url,
            snippet=alert_item.snippet or "",
            source=source_value,
            published_at=alert_item.published_at,
        )

    # ==================== HELPER ====================

    def _compute_url_hash(self, url: str) -> str:
        """Return the first 16 hex characters of the SHA-256 of the normalized URL."""
        normalized = self._normalize_url(url)
        return hashlib.sha256(normalized.encode()).hexdigest()[:16]

    def _normalize_url(self, url: str) -> str:
        """Normalize a URL for de-duplication.

        Removes tracking query parameters (matched case-insensitively),
        lower-cases the host part, strips trailing slashes from the path and
        drops the fragment.
        """
        parsed = urllib.parse.urlparse(url)

        # parse_qs drops parameters with blank values by default; the order
        # of the remaining parameters is preserved.
        query_params = urllib.parse.parse_qs(parsed.query)
        cleaned_params = {
            key: values
            for key, values in query_params.items()
            if key.lower() not in self._TRACKING_PARAMS
        }
        cleaned_query = urllib.parse.urlencode(cleaned_params, doseq=True)

        # Rebuild the URL without the fragment
        return urllib.parse.urlunparse((
            parsed.scheme,
            parsed.netloc.lower(),
            parsed.path.rstrip("/"),
            parsed.params,
            cleaned_query,
            "",  # No fragment
        ))

    # ==================== CONVERSION ====================

    def to_dict(self, alert: AlertItemDB) -> Dict[str, Any]:
        """Convert a DB model into a plain dictionary.

        Datetime fields are rendered with ``isoformat()`` (``None`` if
        unset); enum fields are rendered as their ``.value``. Scoring data
        is nested under ``"relevance"``, user feedback under
        ``"user_feedback"``.
        """
        return {
            "id": alert.id,
            "topic_id": alert.topic_id,
            "title": alert.title,
            "url": alert.url,
            "snippet": alert.snippet,
            "source": alert.source.value,
            "lang": alert.lang,
            "published_at": alert.published_at.isoformat() if alert.published_at else None,
            "fetched_at": alert.fetched_at.isoformat() if alert.fetched_at else None,
            "status": alert.status.value,
            "relevance": {
                "score": alert.relevance_score,
                "decision": alert.relevance_decision.value if alert.relevance_decision else None,
                "reasons": alert.relevance_reasons,
                "summary": alert.relevance_summary,
                "model": alert.scored_by_model,
                "scored_at": alert.scored_at.isoformat() if alert.scored_at else None,
            },
            "user_feedback": {
                "marked_relevant": alert.user_marked_relevant,
                "tags": alert.user_tags,
                "notes": alert.user_notes,
            },
        }
|