backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
185 lines
5.7 KiB
Python
185 lines
5.7 KiB
Python
"""
|
|
AI Email - Deadline Extraction
|
|
|
|
Regex-based and LLM-based deadline extraction from email content.
|
|
|
|
Extracted from ai_service.py to keep files under 500 LOC.
|
|
"""
|
|
|
|
import calendar
import logging
import os
import re
from datetime import datetime, timedelta
from typing import List

import httpx

from .models import DeadlineExtraction
|
|
|
|
# Module-level logger, namespaced to this module.
logger = logging.getLogger(__name__)

# LLM Gateway configuration
# Base URL of the LLM gateway service; overridable via environment variable.
LLM_GATEWAY_URL = os.getenv("LLM_GATEWAY_URL", "http://localhost:8090")
|
|
|
|
|
|
async def extract_deadlines(
    http_client: httpx.AsyncClient,
    subject: str,
    body_text: str,
) -> List[DeadlineExtraction]:
    """
    Extract deadlines from email content.

    Cheap regex patterns run first over subject + body; the LLM is only
    consulted when regex finds nothing and a body is present (first 1000
    characters only, to bound prompt size).
    """
    combined = f"{subject}\n{body_text}" if body_text else subject

    # Fast path: pattern-based extraction.
    found: List[DeadlineExtraction] = list(_extract_deadlines_regex(combined))

    # Fallback: ask the LLM for complex phrasings.
    if body_text and not found:
        found += await _extract_deadlines_llm(http_client, subject, body_text[:1000])

    return found
|
|
|
|
|
|
# Lookup for the month-name pattern ("bis Ende Januar"); keys are lowercase
# because matching is case-insensitive.
_GERMAN_MONTH_NUMBERS = {
    "januar": 1, "februar": 2, "m\u00e4rz": 3, "april": 4, "mai": 5, "juni": 6,
    "juli": 7, "august": 8, "september": 9, "oktober": 10, "november": 11,
    "dezember": 12,
}


def _surrounding_context(text: str, match: "re.Match[str]") -> str:
    """Return up to 50 characters of context on either side of *match*."""
    start = max(0, match.start() - 50)
    end = min(len(text), match.end() + 50)
    return text[start:end].strip()


def _extract_deadlines_regex(text: str) -> List[DeadlineExtraction]:
    """Extract deadlines from *text* using German-language regex patterns.

    Handles three phrasings:
      * explicit dates      ("bis zum 15.01.2025", "Frist: 15.01.25")
      * relative durations  ("innerhalb von 14 Tagen" / "... 2 Wochen")
      * month names         ("bis Ende Januar" -> last day of that month)

    Explicit dates in the past are skipped; unparseable matches are logged
    at DEBUG and ignored. Returns a (possibly empty) list.
    """
    deadlines: List[DeadlineExtraction] = []
    now = datetime.now()

    # (pattern, is_specific_date): when True, groups 1-3 are the day/month/
    # year of a concrete dd.mm.yy(yy) date.
    patterns = [
        # "bis zum 15.01.2025"
        (r"bis\s+(?:zum\s+)?(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True),
        # "spaetestens am 15.01.2025"
        (r"sp\u00e4testens\s+(?:am\s+)?(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True),
        # "Abgabetermin: 15.01.2025"
        (r"(?:Abgabe|Termin|Frist)[:\s]+(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True),
        # "innerhalb von 14 Tagen"
        (r"innerhalb\s+von\s+(\d+)\s+(?:Tagen|Wochen)", False),
        # "bis Ende Januar"
        (r"bis\s+(?:Ende\s+)?(Januar|Februar|M\u00e4rz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)", False),
    ]

    for pattern, is_specific_date in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            try:
                if is_specific_date:
                    day = int(match.group(1))
                    month = int(match.group(2))
                    year = int(match.group(3))

                    # Two-digit years are interpreted as 20xx.
                    if year < 100:
                        year += 2000

                    deadline_date = datetime(year, month, day)

                    # Ignore deadlines that have already passed.
                    if deadline_date < now:
                        continue

                    deadlines.append(DeadlineExtraction(
                        deadline_date=deadline_date,
                        description=f"Frist: {match.group(0)}",
                        confidence=0.85,
                        source_text=_surrounding_context(text, match),
                        is_firm=True,
                    ))

                elif "Tagen" in pattern or "Wochen" in pattern:
                    # Relative duration from now. BUGFIX: the comparison must
                    # use lowercase "wochen" — the original checked "Wochen"
                    # against a lowercased string, so weeks were never
                    # multiplied by 7.
                    days = int(match.group(1))
                    if "wochen" in match.group(0).lower():
                        days *= 7
                    deadlines.append(DeadlineExtraction(
                        deadline_date=now + timedelta(days=days),
                        description=f"Relative Frist: {match.group(0)}",
                        confidence=0.7,
                        source_text=match.group(0),
                        is_firm=False,
                    ))

                else:
                    # Month-name pattern. BUGFIX: this pattern previously
                    # matched but was silently discarded. "bis (Ende) <Monat>"
                    # is interpreted as the last day of the next occurrence of
                    # the named month (this year, or next year if past).
                    month = _GERMAN_MONTH_NUMBERS.get(match.group(1).lower())
                    if month is None:
                        continue
                    year = now.year
                    deadline_date = datetime(
                        year, month, calendar.monthrange(year, month)[1]
                    )
                    if deadline_date < now:
                        year += 1
                        deadline_date = datetime(
                            year, month, calendar.monthrange(year, month)[1]
                        )
                    deadlines.append(DeadlineExtraction(
                        deadline_date=deadline_date,
                        description=f"Frist: {match.group(0)}",
                        confidence=0.6,
                        source_text=_surrounding_context(text, match),
                        is_firm=False,
                    ))

            except (ValueError, IndexError) as e:
                logger.debug(f"Failed to parse date: {e}")
                continue

    return deadlines
|
|
|
|
|
|
async def _extract_deadlines_llm(
    client: httpx.AsyncClient,
    subject: str,
    body_preview: str,
) -> List[DeadlineExtraction]:
    """Extract deadlines via the LLM gateway.

    Sends a German prompt asking for pipe-separated DATE|DESCRIPTION|BINDING
    lines, then parses each line back into a DeadlineExtraction. Any failure
    (network, non-200 response, sentinel "KEINE_FRISTEN", unparseable lines)
    degrades gracefully to an empty list.
    """
    try:
        prompt = f"""Analysiere diese E-Mail und extrahiere alle genannten Fristen und Termine:

Betreff: {subject}
Inhalt: {body_preview}

Liste alle Fristen im folgenden Format auf (eine pro Zeile):
DATUM|BESCHREIBUNG|VERBINDLICH
Beispiel: 2025-01-15|Abgabe der Berichte|ja

Wenn keine Fristen gefunden werden, antworte mit: KEINE_FRISTEN

Antworte NUR im angegebenen Format.
"""

        response = await client.post(
            f"{LLM_GATEWAY_URL}/api/v1/inference",
            json={
                "prompt": prompt,
                "playbook": "mail_analysis",
                "max_tokens": 200,
            },
        )

        # Guard clauses: anything other than a usable 200 yields no results.
        if response.status_code != 200:
            return []
        raw = response.json().get("response", "")
        if "KEINE_FRISTEN" in raw:
            return []

        extracted: List[DeadlineExtraction] = []
        for row in raw.strip().split("\n"):
            fields = row.split("|")
            if len(fields) < 2:
                continue
            try:
                when = datetime.fromisoformat(fields[0].strip())
                what = fields[1].strip()
                firm = fields[2].strip().lower() == "ja" if len(fields) > 2 else True
            except (ValueError, IndexError):
                continue
            extracted.append(DeadlineExtraction(
                deadline_date=when,
                description=what,
                confidence=0.7,
                source_text=row,
                is_firm=firm,
            ))
        return extracted

    except Exception as e:
        logger.warning(f"LLM deadline extraction failed: {e}")

    return []
|