backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
185 lines
5.7 KiB
Python
185 lines
5.7 KiB
Python
"""
|
|
AI Email - Deadline Extraction
|
|
|
|
Regex-based and LLM-based deadline extraction from email content.
|
|
|
|
Extracted from ai_service.py to keep files under 500 LOC.
|
|
"""
|
|
|
|
import calendar
import logging
import os
import re
from datetime import datetime, timedelta
from typing import List

import httpx

from .models import DeadlineExtraction
|
|
|
|
# Module-level logger, namespaced to this module.
logger = logging.getLogger(__name__)

# LLM Gateway configuration
# Base URL of the LLM gateway service; overridable via environment variable.
LLM_GATEWAY_URL = os.getenv("LLM_GATEWAY_URL", "http://localhost:8090")
|
|
|
|
|
|
async def extract_deadlines(
    http_client: httpx.AsyncClient,
    subject: str,
    body_text: str,
) -> List[DeadlineExtraction]:
    """
    Extract deadlines from email content.

    Cheap regex patterns run first over subject + body; the LLM is only
    consulted when regex finds nothing and a body is present (first 1000
    characters only, to bound prompt size).
    """
    combined = f"{subject}\n{body_text}" if body_text else subject

    # Fast path: pattern-based extraction.
    found: List[DeadlineExtraction] = list(_extract_deadlines_regex(combined))

    # Fallback: ask the LLM for complex phrasings.
    if body_text and not found:
        found += await _extract_deadlines_llm(http_client, subject, body_text[:1000])

    return found
|
|
|
|
|
|
# Lookup for the month-name pattern ("bis Ende Januar"); keys are lowercase
# because matching is case-insensitive.
_GERMAN_MONTH_NUMBERS = {
    "januar": 1, "februar": 2, "m\u00e4rz": 3, "april": 4, "mai": 5, "juni": 6,
    "juli": 7, "august": 8, "september": 9, "oktober": 10, "november": 11,
    "dezember": 12,
}


def _surrounding_context(text: str, match: "re.Match[str]") -> str:
    """Return up to 50 characters of context on either side of *match*."""
    start = max(0, match.start() - 50)
    end = min(len(text), match.end() + 50)
    return text[start:end].strip()


def _extract_deadlines_regex(text: str) -> List[DeadlineExtraction]:
    """Extract deadlines from *text* using German-language regex patterns.

    Handles three phrasings:
      * explicit dates      ("bis zum 15.01.2025", "Frist: 15.01.25")
      * relative durations  ("innerhalb von 14 Tagen" / "... 2 Wochen")
      * month names         ("bis Ende Januar" -> last day of that month)

    Explicit dates in the past are skipped; unparseable matches are logged
    at DEBUG and ignored. Returns a (possibly empty) list.
    """
    deadlines: List[DeadlineExtraction] = []
    now = datetime.now()

    # (pattern, is_specific_date): when True, groups 1-3 are the day/month/
    # year of a concrete dd.mm.yy(yy) date.
    patterns = [
        # "bis zum 15.01.2025"
        (r"bis\s+(?:zum\s+)?(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True),
        # "spaetestens am 15.01.2025"
        (r"sp\u00e4testens\s+(?:am\s+)?(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True),
        # "Abgabetermin: 15.01.2025"
        (r"(?:Abgabe|Termin|Frist)[:\s]+(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True),
        # "innerhalb von 14 Tagen"
        (r"innerhalb\s+von\s+(\d+)\s+(?:Tagen|Wochen)", False),
        # "bis Ende Januar"
        (r"bis\s+(?:Ende\s+)?(Januar|Februar|M\u00e4rz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)", False),
    ]

    for pattern, is_specific_date in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            try:
                if is_specific_date:
                    day = int(match.group(1))
                    month = int(match.group(2))
                    year = int(match.group(3))

                    # Two-digit years are interpreted as 20xx.
                    if year < 100:
                        year += 2000

                    deadline_date = datetime(year, month, day)

                    # Ignore deadlines that have already passed.
                    if deadline_date < now:
                        continue

                    deadlines.append(DeadlineExtraction(
                        deadline_date=deadline_date,
                        description=f"Frist: {match.group(0)}",
                        confidence=0.85,
                        source_text=_surrounding_context(text, match),
                        is_firm=True,
                    ))

                elif "Tagen" in pattern or "Wochen" in pattern:
                    # Relative duration from now. BUGFIX: the comparison must
                    # use lowercase "wochen" — the original checked "Wochen"
                    # against a lowercased string, so weeks were never
                    # multiplied by 7.
                    days = int(match.group(1))
                    if "wochen" in match.group(0).lower():
                        days *= 7
                    deadlines.append(DeadlineExtraction(
                        deadline_date=now + timedelta(days=days),
                        description=f"Relative Frist: {match.group(0)}",
                        confidence=0.7,
                        source_text=match.group(0),
                        is_firm=False,
                    ))

                else:
                    # Month-name pattern. BUGFIX: this pattern previously
                    # matched but was silently discarded. "bis (Ende) <Monat>"
                    # is interpreted as the last day of the next occurrence of
                    # the named month (this year, or next year if past).
                    month = _GERMAN_MONTH_NUMBERS.get(match.group(1).lower())
                    if month is None:
                        continue
                    year = now.year
                    deadline_date = datetime(
                        year, month, calendar.monthrange(year, month)[1]
                    )
                    if deadline_date < now:
                        year += 1
                        deadline_date = datetime(
                            year, month, calendar.monthrange(year, month)[1]
                        )
                    deadlines.append(DeadlineExtraction(
                        deadline_date=deadline_date,
                        description=f"Frist: {match.group(0)}",
                        confidence=0.6,
                        source_text=_surrounding_context(text, match),
                        is_firm=False,
                    ))

            except (ValueError, IndexError) as e:
                logger.debug(f"Failed to parse date: {e}")
                continue

    return deadlines
|
|
|
|
|
|
async def _extract_deadlines_llm(
    client: httpx.AsyncClient,
    subject: str,
    body_preview: str,
) -> List[DeadlineExtraction]:
    """Extract deadlines via the LLM gateway.

    Sends a German prompt asking for pipe-separated DATE|DESCRIPTION|BINDING
    lines, then parses each line back into a DeadlineExtraction. Any failure
    (network, non-200 response, sentinel "KEINE_FRISTEN", unparseable lines)
    degrades gracefully to an empty list.
    """
    try:
        prompt = f"""Analysiere diese E-Mail und extrahiere alle genannten Fristen und Termine:

Betreff: {subject}
Inhalt: {body_preview}

Liste alle Fristen im folgenden Format auf (eine pro Zeile):
DATUM|BESCHREIBUNG|VERBINDLICH
Beispiel: 2025-01-15|Abgabe der Berichte|ja

Wenn keine Fristen gefunden werden, antworte mit: KEINE_FRISTEN

Antworte NUR im angegebenen Format.
"""

        response = await client.post(
            f"{LLM_GATEWAY_URL}/api/v1/inference",
            json={
                "prompt": prompt,
                "playbook": "mail_analysis",
                "max_tokens": 200,
            },
        )

        # Guard clauses: anything other than a usable 200 yields no results.
        if response.status_code != 200:
            return []
        raw = response.json().get("response", "")
        if "KEINE_FRISTEN" in raw:
            return []

        extracted: List[DeadlineExtraction] = []
        for row in raw.strip().split("\n"):
            fields = row.split("|")
            if len(fields) < 2:
                continue
            try:
                when = datetime.fromisoformat(fields[0].strip())
                what = fields[1].strip()
                firm = fields[2].strip().lower() == "ja" if len(fields) > 2 else True
            except (ValueError, IndexError):
                continue
            extracted.append(DeadlineExtraction(
                deadline_date=when,
                description=what,
                confidence=0.7,
                source_text=row,
                is_firm=firm,
            ))
        return extracted

    except Exception as e:
        logger.warning(f"LLM deadline extraction failed: {e}")

    return []
|