Files
breakpilot-lehrer/backend-lehrer/alerts_agent/db/item_repository.py
Benjamin Admin b6983ab1dc [split-required] Split 500-1000 LOC files across all services
backend-lehrer (5 files):
- alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3)
- teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3)
- mail/mail_db.py (987 → 6)

klausur-service (5 files):
- legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4)
- ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2)
- KorrekturPage.tsx (956 → 6)

website (5 pages):
- mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7)
- ocr-labeling (946 → 7), audit-workspace (871 → 4)

studio-v2 (5 files + 1 deleted):
- page.tsx (946 → 5), MessagesContext.tsx (925 → 4)
- korrektur (914 → 6), worksheet-cleanup (899 → 6)
- useVocabWorksheet.ts (888 → 3)
- Deleted dead page-original.tsx (934 LOC)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 23:35:37 +02:00

395 lines
12 KiB
Python

"""
Repository für Alert Items (einzelne Alerts/Artikel).
"""
import hashlib
import urllib.parse
import uuid
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Any
from sqlalchemy.orm import Session as DBSession
from sqlalchemy import or_, func
from .models import (
AlertItemDB, AlertSourceEnum, AlertStatusEnum, RelevanceDecisionEnum
)
class AlertItemRepository:
"""Repository für Alert Items (einzelne Alerts/Artikel)."""
def __init__(self, db: DBSession):
    """Bind the repository to an active SQLAlchemy session."""
    self.db = db
# ==================== CREATE ====================
def create(
    self,
    topic_id: str,
    title: str,
    url: str,
    snippet: str = "",
    source: str = "google_alerts_rss",
    published_at: datetime = None,
    lang: str = "de",
) -> AlertItemDB:
    """Persist a new alert row and return the refreshed instance.

    The URL is hashed and canonicalized up front so later duplicate
    detection can compare normalized representations.
    """
    record = AlertItemDB(
        id=str(uuid.uuid4()),
        topic_id=topic_id,
        title=title,
        url=url,
        snippet=snippet,
        source=AlertSourceEnum(source),  # raises ValueError on unknown source
        published_at=published_at,
        lang=lang,
        url_hash=self._compute_url_hash(url),
        canonical_url=self._normalize_url(url),
    )
    self.db.add(record)
    self.db.commit()
    self.db.refresh(record)
    return record
def create_if_not_exists(
    self,
    topic_id: str,
    title: str,
    url: str,
    snippet: str = "",
    source: str = "google_alerts_rss",
    published_at: datetime = None,
) -> Optional[AlertItemDB]:
    """Create an alert unless its normalized URL is already stored.

    Returns the newly created row, or None when a row with the same
    URL hash already exists (i.e. the item is a duplicate).
    """
    digest = self._compute_url_hash(url)
    duplicate = self.db.query(AlertItemDB).filter(
        AlertItemDB.url_hash == digest
    ).first()
    if duplicate is not None:
        return None  # already known
    return self.create(
        topic_id=topic_id,
        title=title,
        url=url,
        snippet=snippet,
        source=source,
        published_at=published_at,
    )
# ==================== READ ====================
def get_by_id(self, alert_id: str) -> Optional[AlertItemDB]:
    """Fetch a single alert by primary key, or None when absent."""
    return (
        self.db.query(AlertItemDB)
        .filter(AlertItemDB.id == alert_id)
        .first()
    )
def get_by_url_hash(self, url_hash: str) -> Optional[AlertItemDB]:
    """Fetch a single alert by its normalized-URL hash, or None."""
    return (
        self.db.query(AlertItemDB)
        .filter(AlertItemDB.url_hash == url_hash)
        .first()
    )
def get_inbox(
    self,
    user_id: str = None,
    topic_id: str = None,
    decision: str = None,
    status: str = None,
    limit: int = 50,
    offset: int = 0,
) -> List[AlertItemDB]:
    """Return a page of inbox items, best relevance score first.

    Without an explicit ``decision`` filter, items decided KEEP or
    REVIEW as well as still-undecided ones are shown.

    NOTE(review): ``user_id`` is accepted but never referenced in the
    query — kept for interface compatibility; confirm whether
    per-user filtering was intended.
    """
    q = self.db.query(AlertItemDB)
    if topic_id:
        q = q.filter(AlertItemDB.topic_id == topic_id)
    if decision:
        q = q.filter(
            AlertItemDB.relevance_decision == RelevanceDecisionEnum(decision)
        )
    else:
        # Default visibility: KEEP, REVIEW, or not yet decided.
        visible = or_(
            AlertItemDB.relevance_decision == RelevanceDecisionEnum.KEEP,
            AlertItemDB.relevance_decision == RelevanceDecisionEnum.REVIEW,
            AlertItemDB.relevance_decision.is_(None),
        )
        q = q.filter(visible)
    if status:
        q = q.filter(AlertItemDB.status == AlertStatusEnum(status))
    q = q.order_by(
        AlertItemDB.relevance_score.desc().nullslast(),
        AlertItemDB.fetched_at.desc(),
    )
    return q.offset(offset).limit(limit).all()
def get_unscored(
    self,
    topic_id: str = None,
    limit: int = 100,
) -> List[AlertItemDB]:
    """Return alerts still in status NEW (not yet scored), newest fetch first."""
    q = self.db.query(AlertItemDB).filter(
        AlertItemDB.status == AlertStatusEnum.NEW
    )
    if topic_id:
        q = q.filter(AlertItemDB.topic_id == topic_id)
    return q.order_by(AlertItemDB.fetched_at.desc()).limit(limit).all()
def get_by_topic(
    self,
    topic_id: str,
    limit: int = 100,
    offset: int = 0,
) -> List[AlertItemDB]:
    """Return a page of all alerts belonging to one topic, newest fetch first."""
    q = (
        self.db.query(AlertItemDB)
        .filter(AlertItemDB.topic_id == topic_id)
        .order_by(AlertItemDB.fetched_at.desc())
    )
    return q.offset(offset).limit(limit).all()
def count_by_status(self, topic_id: str = None) -> Dict[str, int]:
    """Aggregate alert counts per status, keyed by the status enum value."""
    q = self.db.query(
        AlertItemDB.status,
        func.count(AlertItemDB.id).label('count'),
    )
    if topic_id:
        q = q.filter(AlertItemDB.topic_id == topic_id)
    rows = q.group_by(AlertItemDB.status).all()
    return {status.value: n for status, n in rows}
def count_by_decision(self, topic_id: str = None) -> Dict[str, int]:
    """Aggregate alert counts per relevance decision.

    Rows without a decision yet are reported under the key "unscored".
    """
    q = self.db.query(
        AlertItemDB.relevance_decision,
        func.count(AlertItemDB.id).label('count'),
    )
    if topic_id:
        q = q.filter(AlertItemDB.topic_id == topic_id)
    counts: Dict[str, int] = {}
    for decision, n in q.group_by(AlertItemDB.relevance_decision).all():
        counts[decision.value if decision else "unscored"] = n
    return counts
# ==================== UPDATE ====================
def update_scoring(
    self,
    alert_id: str,
    score: float,
    decision: str,
    reasons: List[str] = None,
    summary: str = None,
    model: str = None,
) -> Optional[AlertItemDB]:
    """Store scoring results on an alert and mark it SCORED.

    Args:
        alert_id: Primary key of the alert to update.
        score: Relevance score to store.
        decision: Must be a valid RelevanceDecisionEnum value
            (ValueError otherwise).
        reasons: Optional list of reason strings (stored as [] when None).
        summary: Optional relevance summary text.
        model: Optional identifier of the scoring model.

    Returns:
        The updated alert, or None when ``alert_id`` is unknown.

    Fix: the original called ``datetime.utcnow()`` twice, so
    ``scored_at`` and ``processed_at`` differed by microseconds; both
    now get the same timestamp.
    """
    alert = self.get_by_id(alert_id)
    if not alert:
        return None
    now = datetime.utcnow()
    alert.relevance_score = score
    alert.relevance_decision = RelevanceDecisionEnum(decision)
    alert.relevance_reasons = reasons or []
    alert.relevance_summary = summary
    alert.scored_by_model = model
    alert.scored_at = now
    alert.status = AlertStatusEnum.SCORED
    alert.processed_at = now
    self.db.commit()
    self.db.refresh(alert)
    return alert
def update_status(
    self,
    alert_id: str,
    status: str,
) -> Optional[AlertItemDB]:
    """Set a new status on an alert.

    Raises ValueError when ``status`` is not a valid AlertStatusEnum
    value. Returns the updated alert, or None when the id is unknown.
    """
    alert = self.get_by_id(alert_id)
    if alert is None:
        return None
    alert.status = AlertStatusEnum(status)
    self.db.commit()
    self.db.refresh(alert)
    return alert
def mark_reviewed(
    self,
    alert_id: str,
    is_relevant: bool,
    notes: str = None,
    tags: List[str] = None,
) -> Optional[AlertItemDB]:
    """Mark an alert as reviewed and record user feedback.

    Args:
        alert_id: Primary key of the alert.
        is_relevant: The user's relevance verdict.
        notes: Free-text notes; stored when not None.
        tags: Tag list; stored when not None.

    Returns:
        The updated alert, or None when ``alert_id`` is unknown.

    Fix: the original used truthiness (``if notes:`` / ``if tags:``),
    silently discarding an explicitly passed empty string or empty
    list; explicit ``is not None`` checks store those too.
    """
    alert = self.get_by_id(alert_id)
    if not alert:
        return None
    alert.status = AlertStatusEnum.REVIEWED
    alert.user_marked_relevant = is_relevant
    if notes is not None:
        alert.user_notes = notes
    if tags is not None:
        alert.user_tags = tags
    self.db.commit()
    self.db.refresh(alert)
    return alert
def archive(self, alert_id: str) -> Optional[AlertItemDB]:
    """Archive an alert.

    Convenience wrapper delegating to ``update_status`` with the
    "archived" status value. Returns the updated alert, or None
    when the id is unknown.
    """
    return self.update_status(alert_id, "archived")
# ==================== DELETE ====================
def delete(self, alert_id: str) -> bool:
    """Delete one alert; True on success, False when the id is unknown."""
    alert = self.get_by_id(alert_id)
    if alert is None:
        return False
    self.db.delete(alert)
    self.db.commit()
    return True
def delete_old(self, days: int = 90, topic_id: str = None) -> int:
    """Bulk-delete ARCHIVED alerts fetched more than ``days`` ago.

    Returns the number of rows removed.
    """
    threshold = datetime.utcnow() - timedelta(days=days)
    q = self.db.query(AlertItemDB).filter(
        AlertItemDB.status == AlertStatusEnum.ARCHIVED,
        AlertItemDB.fetched_at < threshold,
    )
    if topic_id:
        q = q.filter(AlertItemDB.topic_id == topic_id)
    removed = q.delete()
    self.db.commit()
    return removed
# ==================== FOR RSS FETCHER ====================
def get_existing_urls(self, topic_id: str) -> set:
    """Return every known (non-empty) url_hash for a topic.

    Used by the RSS fetcher to skip items it has already seen.
    """
    rows = self.db.query(AlertItemDB.url_hash).filter(
        AlertItemDB.topic_id == topic_id
    ).all()
    return {digest for (digest,) in rows if digest}
def create_from_alert_item(self, alert_item, topic_id: str) -> AlertItemDB:
    """Persist an AlertItem object produced by the RSS fetcher.

    Args:
        alert_item: AlertItem from rss_fetcher
        topic_id: Topic ID to associate with
    Returns:
        Created AlertItemDB instance
    """
    raw_source = alert_item.source
    # The fetcher may hand over an enum-like object or a plain string.
    source_name = raw_source.value if hasattr(raw_source, 'value') else str(raw_source)
    return self.create(
        topic_id=topic_id,
        title=alert_item.title,
        url=alert_item.url,
        snippet=alert_item.snippet or "",
        source=source_name,
        published_at=alert_item.published_at,
    )
# ==================== HELPER ====================
def _compute_url_hash(self, url: str) -> str:
"""Berechnet SHA256 Hash der normalisierten URL."""
normalized = self._normalize_url(url)
return hashlib.sha256(normalized.encode()).hexdigest()[:16]
def _normalize_url(self, url: str) -> str:
"""Normalisiert URL für Deduplizierung."""
parsed = urllib.parse.urlparse(url)
# Tracking-Parameter entfernen
tracking_params = {
"utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
"fbclid", "gclid", "ref", "source"
}
query_params = urllib.parse.parse_qs(parsed.query)
cleaned_params = {k: v for k, v in query_params.items()
if k.lower() not in tracking_params}
cleaned_query = urllib.parse.urlencode(cleaned_params, doseq=True)
# Rekonstruiere URL ohne Fragment
normalized = urllib.parse.urlunparse((
parsed.scheme,
parsed.netloc.lower(),
parsed.path.rstrip("/"),
parsed.params,
cleaned_query,
"" # No fragment
))
return normalized
# ==================== CONVERSION ====================
def to_dict(self, alert: AlertItemDB) -> Dict[str, Any]:
"""Konvertiert DB-Model zu Dictionary."""
return {
"id": alert.id,
"topic_id": alert.topic_id,
"title": alert.title,
"url": alert.url,
"snippet": alert.snippet,
"source": alert.source.value,
"lang": alert.lang,
"published_at": alert.published_at.isoformat() if alert.published_at else None,
"fetched_at": alert.fetched_at.isoformat() if alert.fetched_at else None,
"status": alert.status.value,
"relevance": {
"score": alert.relevance_score,
"decision": alert.relevance_decision.value if alert.relevance_decision else None,
"reasons": alert.relevance_reasons,
"summary": alert.relevance_summary,
"model": alert.scored_by_model,
"scored_at": alert.scored_at.isoformat() if alert.scored_at else None,
},
"user_feedback": {
"marked_relevant": alert.user_marked_relevant,
"tags": alert.user_tags,
"notes": alert.user_notes,
},
}