""" AI Email - Deadline Extraction Regex-based and LLM-based deadline extraction from email content. Extracted from ai_service.py to keep files under 500 LOC. """ import os import re import logging from typing import List from datetime import datetime, timedelta import httpx from .models import DeadlineExtraction logger = logging.getLogger(__name__) # LLM Gateway configuration LLM_GATEWAY_URL = os.getenv("LLM_GATEWAY_URL", "http://localhost:8090") async def extract_deadlines( http_client: httpx.AsyncClient, subject: str, body_text: str, ) -> List[DeadlineExtraction]: """ Extract deadlines from email content. Uses regex patterns first, then LLM for complex cases. """ deadlines = [] full_text = f"{subject}\n{body_text}" if body_text else subject # Try regex extraction first regex_deadlines = _extract_deadlines_regex(full_text) deadlines.extend(regex_deadlines) # If no regex matches, try LLM if not deadlines and body_text: llm_deadlines = await _extract_deadlines_llm(http_client, subject, body_text[:1000]) deadlines.extend(llm_deadlines) return deadlines def _extract_deadlines_regex(text: str) -> List[DeadlineExtraction]: """Extract deadlines using regex patterns.""" deadlines = [] now = datetime.now() # German date patterns patterns = [ # "bis zum 15.01.2025" (r"bis\s+(?:zum\s+)?(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True), # "spaetestens am 15.01.2025" (r"sp\u00e4testens\s+(?:am\s+)?(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True), # "Abgabetermin: 15.01.2025" (r"(?:Abgabe|Termin|Frist)[:\s]+(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True), # "innerhalb von 14 Tagen" (r"innerhalb\s+von\s+(\d+)\s+(?:Tagen|Wochen)", False), # "bis Ende Januar" (r"bis\s+(?:Ende\s+)?(Januar|Februar|M\u00e4rz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)", False), ] for pattern, is_specific_date in patterns: matches = re.finditer(pattern, text, re.IGNORECASE) for match in matches: try: if is_specific_date: day = int(match.group(1)) month = int(match.group(2)) year = int(match.group(3)) if year < 100: year += 2000 deadline_date = datetime(year, month, day) if deadline_date < now: continue start = max(0, match.start() - 50) end = min(len(text), match.end() + 50) context = text[start:end].strip() deadlines.append(DeadlineExtraction( deadline_date=deadline_date, description=f"Frist: {match.group(0)}", confidence=0.85, source_text=context, is_firm=True, )) else: if "Tagen" in pattern or "Wochen" in pattern: days = int(match.group(1)) if "Wochen" in match.group(0).lower(): days *= 7 deadline_date = now + timedelta(days=days) deadlines.append(DeadlineExtraction( deadline_date=deadline_date, description=f"Relative Frist: {match.group(0)}", confidence=0.7, source_text=match.group(0), is_firm=False, )) except (ValueError, IndexError) as e: logger.debug(f"Failed to parse date: {e}") continue return deadlines async def _extract_deadlines_llm( client: httpx.AsyncClient, subject: str, body_preview: str, ) -> List[DeadlineExtraction]: """Extract deadlines using LLM.""" try: prompt = f"""Analysiere diese E-Mail und extrahiere alle genannten Fristen und Termine: Betreff: {subject} Inhalt: {body_preview} Liste alle Fristen im folgenden Format auf (eine pro Zeile): DATUM|BESCHREIBUNG|VERBINDLICH Beispiel: 2025-01-15|Abgabe der Berichte|ja Wenn keine Fristen gefunden werden, antworte mit: KEINE_FRISTEN Antworte NUR im angegebenen Format. """ response = await client.post( f"{LLM_GATEWAY_URL}/api/v1/inference", json={ "prompt": prompt, "playbook": "mail_analysis", "max_tokens": 200, }, ) if response.status_code == 200: data = response.json() result_text = data.get("response", "") if "KEINE_FRISTEN" in result_text: return [] deadlines = [] for line in result_text.strip().split("\n"): parts = line.split("|") if len(parts) >= 2: try: date_str = parts[0].strip() deadline_date = datetime.fromisoformat(date_str) description = parts[1].strip() is_firm = parts[2].strip().lower() == "ja" if len(parts) > 2 else True deadlines.append(DeadlineExtraction( deadline_date=deadline_date, description=description, confidence=0.7, source_text=line, is_firm=is_firm, )) except (ValueError, IndexError): continue return deadlines except Exception as e: logger.warning(f"LLM deadline extraction failed: {e}") return []