[split-required] Split 500-1000 LOC files across all services
backend-lehrer (5 files):
- alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3)
- teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3)
- mail/mail_db.py (987 → 6)

klausur-service (5 files):
- legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4)
- ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2)
- KorrekturPage.tsx (956 → 6)

website (5 pages):
- mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7)
- ocr-labeling (946 → 7), audit-workspace (871 → 4)

studio-v2 (5 files + 1 deleted):
- page.tsx (946 → 5), MessagesContext.tsx (925 → 4)
- korrektur (914 → 6), worksheet-cleanup (899 → 6)
- useVocabWorksheet.ts (888 → 3)
- Deleted dead page-original.tsx (934 LOC)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
394
backend-lehrer/alerts_agent/db/item_repository.py
Normal file
394
backend-lehrer/alerts_agent/db/item_repository.py
Normal file
@@ -0,0 +1,394 @@
|
||||
"""
|
||||
Repository für Alert Items (einzelne Alerts/Artikel).
|
||||
"""
|
||||
import hashlib
|
||||
import urllib.parse
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional, List, Dict, Any
|
||||
from sqlalchemy.orm import Session as DBSession
|
||||
from sqlalchemy import or_, func
|
||||
|
||||
from .models import (
|
||||
AlertItemDB, AlertSourceEnum, AlertStatusEnum, RelevanceDecisionEnum
|
||||
)
|
||||
|
||||
|
||||
class AlertItemRepository:
    """Repository for alert items (individual alerts/articles).

    Wraps all CRUD access to ``AlertItemDB`` rows behind a single
    SQLAlchemy session. URLs are normalized and hashed on write so the
    same article fetched twice is recognised as a duplicate.
    """

    def __init__(self, db: DBSession):
        # The session is owned by the caller; this repository commits
        # after each mutating operation but never closes the session.
        self.db = db

    # ==================== CREATE ====================

    def create(
        self,
        topic_id: str,
        title: str,
        url: str,
        snippet: str = "",
        source: str = "google_alerts_rss",
        published_at: Optional[datetime] = None,
        lang: str = "de",
    ) -> AlertItemDB:
        """Create a new alert and persist it immediately.

        Args:
            topic_id: Topic the alert belongs to.
            title: Article title.
            url: Article URL (stored verbatim; also normalized/hashed).
            snippet: Short excerpt, may be empty.
            source: Must be a valid ``AlertSourceEnum`` value.
            published_at: Publication timestamp, if known.
            lang: ISO language code of the article.

        Returns:
            The refreshed, persisted ``AlertItemDB`` row.

        Raises:
            ValueError: If ``source`` is not a valid ``AlertSourceEnum`` value.
        """
        url_hash = self._compute_url_hash(url)

        alert = AlertItemDB(
            id=str(uuid.uuid4()),
            topic_id=topic_id,
            title=title,
            url=url,
            snippet=snippet,
            source=AlertSourceEnum(source),
            published_at=published_at,
            lang=lang,
            url_hash=url_hash,
            canonical_url=self._normalize_url(url),
        )
        self.db.add(alert)
        self.db.commit()
        self.db.refresh(alert)
        return alert

    def create_if_not_exists(
        self,
        topic_id: str,
        title: str,
        url: str,
        snippet: str = "",
        source: str = "google_alerts_rss",
        published_at: Optional[datetime] = None,
    ) -> Optional[AlertItemDB]:
        """Create an alert only if its URL hash is not yet stored.

        NOTE(review): deduplication is *global* (filters on ``url_hash``
        only, not per topic) — the same URL under a second topic is
        treated as a duplicate. Check-then-insert is also not atomic;
        concurrent fetchers could race — confirm whether a DB unique
        constraint backs this up.

        Returns:
            The created row, or ``None`` if the URL already exists.
        """
        url_hash = self._compute_url_hash(url)

        existing = self.db.query(AlertItemDB).filter(
            AlertItemDB.url_hash == url_hash
        ).first()

        if existing:
            return None  # duplicate

        return self.create(
            topic_id=topic_id,
            title=title,
            url=url,
            snippet=snippet,
            source=source,
            published_at=published_at,
        )

    # ==================== READ ====================

    def get_by_id(self, alert_id: str) -> Optional[AlertItemDB]:
        """Return the alert with the given ID, or ``None``."""
        return self.db.query(AlertItemDB).filter(
            AlertItemDB.id == alert_id
        ).first()

    def get_by_url_hash(self, url_hash: str) -> Optional[AlertItemDB]:
        """Return the alert with the given URL hash, or ``None``."""
        return self.db.query(AlertItemDB).filter(
            AlertItemDB.url_hash == url_hash
        ).first()

    def get_inbox(
        self,
        user_id: Optional[str] = None,
        topic_id: Optional[str] = None,
        decision: Optional[str] = None,
        status: Optional[str] = None,
        limit: int = 50,
        offset: int = 0,
    ) -> List[AlertItemDB]:
        """Return inbox items, filtered and paginated.

        Without an explicit ``decision`` filter, KEEP, REVIEW and
        still-unscored items are shown.

        NOTE(review): ``user_id`` is accepted but currently never used
        for filtering — results are not scoped per user.

        Args:
            user_id: Currently ignored (see note above).
            topic_id: Restrict to a single topic.
            decision: Exact ``RelevanceDecisionEnum`` value to filter on.
            status: Exact ``AlertStatusEnum`` value to filter on.
            limit: Page size.
            offset: Page offset.

        Raises:
            ValueError: If ``decision`` or ``status`` is not a valid enum value.
        """
        query = self.db.query(AlertItemDB)

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        if decision:
            query = query.filter(
                AlertItemDB.relevance_decision == RelevanceDecisionEnum(decision)
            )
        else:
            # Default: KEEP and REVIEW (plus not-yet-scored items)
            query = query.filter(
                or_(
                    AlertItemDB.relevance_decision == RelevanceDecisionEnum.KEEP,
                    AlertItemDB.relevance_decision == RelevanceDecisionEnum.REVIEW,
                    AlertItemDB.relevance_decision.is_(None)
                )
            )

        if status:
            query = query.filter(AlertItemDB.status == AlertStatusEnum(status))

        # Highest relevance first; unscored items sort last, newest-fetched
        # first within equal scores.
        return query.order_by(
            AlertItemDB.relevance_score.desc().nullslast(),
            AlertItemDB.fetched_at.desc()
        ).offset(offset).limit(limit).all()

    def get_unscored(
        self,
        topic_id: Optional[str] = None,
        limit: int = 100,
    ) -> List[AlertItemDB]:
        """Return alerts still in status NEW (not yet scored), newest first."""
        query = self.db.query(AlertItemDB).filter(
            AlertItemDB.status == AlertStatusEnum.NEW
        )

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        return query.order_by(AlertItemDB.fetched_at.desc()).limit(limit).all()

    def get_by_topic(
        self,
        topic_id: str,
        limit: int = 100,
        offset: int = 0,
    ) -> List[AlertItemDB]:
        """Return all alerts of a topic, newest-fetched first, paginated."""
        return self.db.query(AlertItemDB).filter(
            AlertItemDB.topic_id == topic_id
        ).order_by(
            AlertItemDB.fetched_at.desc()
        ).offset(offset).limit(limit).all()

    def count_by_status(self, topic_id: Optional[str] = None) -> Dict[str, int]:
        """Count alerts grouped by status.

        Returns:
            Mapping of status value -> count. Rows with a NULL status
            (if any exist) are reported under ``"unknown"``.
        """
        query = self.db.query(
            AlertItemDB.status,
            func.count(AlertItemDB.id).label('count')
        )

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        results = query.group_by(AlertItemDB.status).all()

        # Guard against NULL status, mirroring count_by_decision below;
        # previously this raised AttributeError on r[0].value.
        return {(r[0].value if r[0] else "unknown"): r[1] for r in results}

    def count_by_decision(self, topic_id: Optional[str] = None) -> Dict[str, int]:
        """Count alerts grouped by relevance decision.

        Returns:
            Mapping of decision value -> count; not-yet-scored rows
            (NULL decision) are reported under ``"unscored"``.
        """
        query = self.db.query(
            AlertItemDB.relevance_decision,
            func.count(AlertItemDB.id).label('count')
        )

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        results = query.group_by(AlertItemDB.relevance_decision).all()

        return {
            (r[0].value if r[0] else "unscored"): r[1]
            for r in results
        }

    # ==================== UPDATE ====================

    def update_scoring(
        self,
        alert_id: str,
        score: float,
        decision: str,
        reasons: Optional[List[str]] = None,
        summary: Optional[str] = None,
        model: Optional[str] = None,
    ) -> Optional[AlertItemDB]:
        """Store the relevance scoring result on an alert.

        Also advances the alert to status SCORED and stamps
        ``scored_at``/``processed_at``.

        Returns:
            The updated alert, or ``None`` if ``alert_id`` is unknown.

        Raises:
            ValueError: If ``decision`` is not a valid ``RelevanceDecisionEnum`` value.
        """
        alert = self.get_by_id(alert_id)
        if not alert:
            return None

        alert.relevance_score = score
        alert.relevance_decision = RelevanceDecisionEnum(decision)
        alert.relevance_reasons = reasons or []
        alert.relevance_summary = summary
        alert.scored_by_model = model
        alert.scored_at = datetime.utcnow()
        alert.status = AlertStatusEnum.SCORED
        alert.processed_at = datetime.utcnow()

        self.db.commit()
        self.db.refresh(alert)
        return alert

    def update_status(
        self,
        alert_id: str,
        status: str,
    ) -> Optional[AlertItemDB]:
        """Set the status of an alert.

        Returns:
            The updated alert, or ``None`` if ``alert_id`` is unknown.

        Raises:
            ValueError: If ``status`` is not a valid ``AlertStatusEnum`` value.
        """
        alert = self.get_by_id(alert_id)
        if not alert:
            return None

        alert.status = AlertStatusEnum(status)

        self.db.commit()
        self.db.refresh(alert)
        return alert

    def mark_reviewed(
        self,
        alert_id: str,
        is_relevant: bool,
        notes: Optional[str] = None,
        tags: Optional[List[str]] = None,
    ) -> Optional[AlertItemDB]:
        """Mark an alert as reviewed, storing the user's feedback.

        ``notes``/``tags`` are only written when truthy, so existing
        values are kept if the caller passes nothing.

        Returns:
            The updated alert, or ``None`` if ``alert_id`` is unknown.
        """
        alert = self.get_by_id(alert_id)
        if not alert:
            return None

        alert.status = AlertStatusEnum.REVIEWED
        alert.user_marked_relevant = is_relevant
        if notes:
            alert.user_notes = notes
        if tags:
            alert.user_tags = tags

        self.db.commit()
        self.db.refresh(alert)
        return alert

    def archive(self, alert_id: str) -> Optional[AlertItemDB]:
        """Archive an alert (convenience wrapper around update_status)."""
        return self.update_status(alert_id, "archived")

    # ==================== DELETE ====================

    def delete(self, alert_id: str) -> bool:
        """Delete an alert.

        Returns:
            ``True`` if a row was deleted, ``False`` if the ID was unknown.
        """
        alert = self.get_by_id(alert_id)
        if not alert:
            return False

        self.db.delete(alert)
        self.db.commit()
        return True

    def delete_old(self, days: int = 90, topic_id: Optional[str] = None) -> int:
        """Delete archived alerts fetched more than ``days`` days ago.

        Only rows in status ARCHIVED are affected.

        Returns:
            Number of rows deleted.
        """
        cutoff = datetime.utcnow() - timedelta(days=days)

        query = self.db.query(AlertItemDB).filter(
            AlertItemDB.status == AlertStatusEnum.ARCHIVED,
            AlertItemDB.fetched_at < cutoff,
        )

        if topic_id:
            query = query.filter(AlertItemDB.topic_id == topic_id)

        count = query.delete()
        self.db.commit()
        return count

    # ==================== FOR RSS FETCHER ====================

    def get_existing_urls(self, topic_id: str) -> set:
        """Return all known URL *hashes* for a topic.

        Used by the RSS fetcher to avoid inserting duplicates.
        NOTE(review): despite the name, this returns hashes
        (``url_hash`` values), not full URLs.
        """
        results = self.db.query(AlertItemDB.url_hash).filter(
            AlertItemDB.topic_id == topic_id
        ).all()

        return {r[0] for r in results if r[0]}

    def create_from_alert_item(self, alert_item, topic_id: str) -> AlertItemDB:
        """Create an alert from an ``AlertItem`` object produced by the RSS fetcher.

        Args:
            alert_item: AlertItem from rss_fetcher
            topic_id: Topic ID to associate with

        Returns:
            Created AlertItemDB instance
        """
        return self.create(
            topic_id=topic_id,
            title=alert_item.title,
            url=alert_item.url,
            snippet=alert_item.snippet or "",
            # The fetcher may hand us either an enum or a plain string.
            source=alert_item.source.value if hasattr(alert_item.source, 'value') else str(alert_item.source),
            published_at=alert_item.published_at,
        )

    # ==================== HELPER ====================

    def _compute_url_hash(self, url: str) -> str:
        """Return the first 16 hex chars of SHA-256 over the normalized URL."""
        normalized = self._normalize_url(url)
        return hashlib.sha256(normalized.encode()).hexdigest()[:16]

    def _normalize_url(self, url: str) -> str:
        """Normalize a URL for deduplication.

        Lower-cases the host, strips the fragment, removes a trailing
        slash from the path, and drops common tracking query parameters.
        """
        parsed = urllib.parse.urlparse(url)

        # Tracking parameters to remove (compared case-insensitively).
        tracking_params = {
            "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
            "fbclid", "gclid", "ref", "source"
        }

        query_params = urllib.parse.parse_qs(parsed.query)
        cleaned_params = {k: v for k, v in query_params.items()
                          if k.lower() not in tracking_params}

        cleaned_query = urllib.parse.urlencode(cleaned_params, doseq=True)

        # Rebuild the URL without a fragment.
        normalized = urllib.parse.urlunparse((
            parsed.scheme,
            parsed.netloc.lower(),
            parsed.path.rstrip("/"),
            parsed.params,
            cleaned_query,
            ""  # No fragment
        ))

        return normalized

    # ==================== CONVERSION ====================

    def to_dict(self, alert: AlertItemDB) -> Dict[str, Any]:
        """Convert a DB model row to a plain JSON-serializable dictionary."""
        return {
            "id": alert.id,
            "topic_id": alert.topic_id,
            "title": alert.title,
            "url": alert.url,
            "snippet": alert.snippet,
            "source": alert.source.value,
            "lang": alert.lang,
            "published_at": alert.published_at.isoformat() if alert.published_at else None,
            "fetched_at": alert.fetched_at.isoformat() if alert.fetched_at else None,
            "status": alert.status.value,
            "relevance": {
                "score": alert.relevance_score,
                "decision": alert.relevance_decision.value if alert.relevance_decision else None,
                "reasons": alert.relevance_reasons,
                "summary": alert.relevance_summary,
                "model": alert.scored_by_model,
                "scored_at": alert.scored_at.isoformat() if alert.scored_at else None,
            },
            "user_feedback": {
                "marked_relevant": alert.user_marked_relevant,
                "tags": alert.user_tags,
                "notes": alert.user_notes,
            },
        }
|
||||
Reference in New Issue
Block a user