[split-required] Split 700-870 LOC files across all services

backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 08:01:18 +02:00
parent b6983ab1dc
commit 34da9f4cda
106 changed files with 16500 additions and 16947 deletions

View File

@@ -0,0 +1,184 @@
"""
AI Email - Deadline Extraction
Regex-based and LLM-based deadline extraction from email content.
Extracted from ai_service.py to keep files under 500 LOC.
"""
import os
import re
import logging
from typing import List
from datetime import datetime, timedelta
import httpx
from .models import DeadlineExtraction
logger = logging.getLogger(__name__)
# LLM Gateway configuration
LLM_GATEWAY_URL = os.getenv("LLM_GATEWAY_URL", "http://localhost:8090")
async def extract_deadlines(
    http_client: httpx.AsyncClient,
    subject: str,
    body_text: str,
) -> List[DeadlineExtraction]:
    """
    Extract deadlines from email content.

    Runs the cheap regex pass first; only when it finds nothing (and a body
    is available) does it fall back to the LLM, sending at most the first
    1000 characters of the body to keep the prompt small.
    """
    combined = f"{subject}\n{body_text}" if body_text else subject
    found: List[DeadlineExtraction] = list(_extract_deadlines_regex(combined))
    if not found and body_text:
        found += await _extract_deadlines_llm(http_client, subject, body_text[:1000])
    return found
# Lowercase German month names -> month numbers. Patterns match
# case-insensitively, so lookups are done on the lowercased capture.
_GERMAN_MONTHS = {
    "januar": 1, "februar": 2, "märz": 3, "april": 4,
    "mai": 5, "juni": 6, "juli": 7, "august": 8,
    "september": 9, "oktober": 10, "november": 11, "dezember": 12,
}


def _end_of_month(year: int, month: int) -> datetime:
    """Return midnight of the last day of the given month."""
    if month == 12:
        return datetime(year, 12, 31)
    # First day of the next month, minus one day.
    return datetime(year, month + 1, 1) - timedelta(days=1)


def _extract_deadlines_regex(text: str) -> List[DeadlineExtraction]:
    """Extract deadlines from *text* using German-language regex patterns.

    Three pattern families are recognised:
      * explicit dates ("bis zum 15.01.2025")      -> firm, confidence 0.85
      * relative spans ("innerhalb von 14 Tagen")  -> soft, confidence 0.7
      * month names    ("bis Ende Januar")         -> soft, confidence 0.6

    Explicit dates already in the past are skipped; parse failures are
    logged at DEBUG level and ignored.
    """
    deadlines: List[DeadlineExtraction] = []
    now = datetime.now()
    # German date patterns; the bool marks whether groups are a dd.mm.yyyy date.
    patterns = [
        # "bis zum 15.01.2025"
        (r"bis\s+(?:zum\s+)?(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True),
        # "spaetestens am 15.01.2025"
        (r"sp\u00e4testens\s+(?:am\s+)?(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True),
        # "Abgabetermin: 15.01.2025"
        (r"(?:Abgabe|Termin|Frist)[:\s]+(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True),
        # "innerhalb von 14 Tagen"
        (r"innerhalb\s+von\s+(\d+)\s+(?:Tagen|Wochen)", False),
        # "bis Ende Januar"
        (r"bis\s+(?:Ende\s+)?(Januar|Februar|M\u00e4rz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)", False),
    ]
    for pattern, is_specific_date in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            try:
                if is_specific_date:
                    day = int(match.group(1))
                    month = int(match.group(2))
                    year = int(match.group(3))
                    if year < 100:
                        # Two-digit years are interpreted as 20xx.
                        year += 2000
                    deadline_date = datetime(year, month, day)
                    if deadline_date < now:
                        # Ignore deadlines that have already passed.
                        continue
                    # Keep +-50 chars of surrounding context for display.
                    start = max(0, match.start() - 50)
                    end = min(len(text), match.end() + 50)
                    context = text[start:end].strip()
                    deadlines.append(DeadlineExtraction(
                        deadline_date=deadline_date,
                        description=f"Frist: {match.group(0)}",
                        confidence=0.85,
                        source_text=context,
                        is_firm=True,
                    ))
                elif "Tagen" in pattern or "Wochen" in pattern:
                    days = int(match.group(1))
                    # BUGFIX: previously compared "Wochen" (capital W) against
                    # the lowercased match, so weeks were never multiplied.
                    if "wochen" in match.group(0).lower():
                        days *= 7
                    deadline_date = now + timedelta(days=days)
                    deadlines.append(DeadlineExtraction(
                        deadline_date=deadline_date,
                        description=f"Relative Frist: {match.group(0)}",
                        confidence=0.7,
                        source_text=match.group(0),
                        is_firm=False,
                    ))
                else:
                    # Month-name pattern ("bis Ende Januar"). Previously this
                    # branch silently dropped the match; interpret it as the
                    # last day of the next occurrence of that month.
                    month = _GERMAN_MONTHS[match.group(1).lower()]
                    deadline_date = _end_of_month(now.year, month)
                    if deadline_date < now:
                        deadline_date = _end_of_month(now.year + 1, month)
                    deadlines.append(DeadlineExtraction(
                        deadline_date=deadline_date,
                        description=f"Frist: {match.group(0)}",
                        confidence=0.6,
                        source_text=match.group(0),
                        is_firm=False,
                    ))
            except (ValueError, IndexError, KeyError) as e:
                logger.debug(f"Failed to parse date: {e}")
                continue
    return deadlines
def _parse_deadline_lines(result_text: str) -> List[DeadlineExtraction]:
    """Parse `DATUM|BESCHREIBUNG|VERBINDLICH` lines from an LLM answer.

    Lines with fewer than two fields or an unparseable ISO date are skipped.
    The third field defaults to firm ("ja") when missing.
    """
    deadlines: List[DeadlineExtraction] = []
    for line in result_text.strip().split("\n"):
        parts = line.split("|")
        if len(parts) < 2:
            continue
        try:
            deadline_date = datetime.fromisoformat(parts[0].strip())
        except ValueError:
            continue
        is_firm = parts[2].strip().lower() == "ja" if len(parts) > 2 else True
        deadlines.append(DeadlineExtraction(
            deadline_date=deadline_date,
            description=parts[1].strip(),
            confidence=0.7,
            source_text=line,
            is_firm=is_firm,
        ))
    return deadlines


async def _extract_deadlines_llm(
    client: httpx.AsyncClient,
    subject: str,
    body_preview: str,
) -> List[DeadlineExtraction]:
    """Extract deadlines by querying the LLM gateway.

    Sends a German prompt requesting `DATUM|BESCHREIBUNG|VERBINDLICH` lines
    and parses the response. Always returns a list: empty when the gateway
    reports no deadlines, answers with a non-200 status, or any error
    occurs (failures are logged at WARNING level, never raised).
    """
    try:
        prompt = f"""Analysiere diese E-Mail und extrahiere alle genannten Fristen und Termine:
Betreff: {subject}
Inhalt: {body_preview}
Liste alle Fristen im folgenden Format auf (eine pro Zeile):
DATUM|BESCHREIBUNG|VERBINDLICH
Beispiel: 2025-01-15|Abgabe der Berichte|ja
Wenn keine Fristen gefunden werden, antworte mit: KEINE_FRISTEN
Antworte NUR im angegebenen Format.
"""
        response = await client.post(
            f"{LLM_GATEWAY_URL}/api/v1/inference",
            json={
                "prompt": prompt,
                "playbook": "mail_analysis",
                "max_tokens": 200,
            },
        )
        if response.status_code != 200:
            # Previously a non-200 response fell through without an explicit
            # return, so the caller could receive None; treat it as
            # "no deadlines found" and record why.
            logger.warning(f"LLM deadline extraction got status {response.status_code}")
            return []
        result_text = response.json().get("response", "")
        if "KEINE_FRISTEN" in result_text:
            return []
        return _parse_deadline_lines(result_text)
    except Exception as e:
        logger.warning(f"LLM deadline extraction failed: {e}")
        return []