[split-required] Split 700-870 LOC files across all services

backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 08:01:18 +02:00
parent b6983ab1dc
commit 34da9f4cda
106 changed files with 16500 additions and 16947 deletions

View File

@@ -0,0 +1,184 @@
"""
AI Email - Deadline Extraction
Regex-based and LLM-based deadline extraction from email content.
Extracted from ai_service.py to keep files under 500 LOC.
"""
import os
import re
import logging
from typing import List
from datetime import datetime, timedelta
import httpx
from .models import DeadlineExtraction
logger = logging.getLogger(__name__)
# LLM Gateway configuration
LLM_GATEWAY_URL = os.getenv("LLM_GATEWAY_URL", "http://localhost:8090")
async def extract_deadlines(
    http_client: httpx.AsyncClient,
    subject: str,
    body_text: str,
) -> List[DeadlineExtraction]:
    """
    Extract deadlines from email content.

    Runs the cheap regex pass first; only when it finds nothing (and a body
    is available) does it fall back to the LLM, sending at most the first
    1000 characters of the body to keep the prompt small.
    """
    combined = f"{subject}\n{body_text}" if body_text else subject
    found: List[DeadlineExtraction] = list(_extract_deadlines_regex(combined))
    if not found and body_text:
        found += await _extract_deadlines_llm(http_client, subject, body_text[:1000])
    return found
# Lowercase German month names -> month numbers. Patterns match
# case-insensitively, so lookups are done on the lowercased capture.
_GERMAN_MONTHS = {
    "januar": 1, "februar": 2, "märz": 3, "april": 4,
    "mai": 5, "juni": 6, "juli": 7, "august": 8,
    "september": 9, "oktober": 10, "november": 11, "dezember": 12,
}


def _end_of_month(year: int, month: int) -> datetime:
    """Return midnight of the last day of the given month."""
    if month == 12:
        return datetime(year, 12, 31)
    # First day of the next month, minus one day.
    return datetime(year, month + 1, 1) - timedelta(days=1)


def _extract_deadlines_regex(text: str) -> List[DeadlineExtraction]:
    """Extract deadlines from *text* using German-language regex patterns.

    Three pattern families are recognised:
      * explicit dates ("bis zum 15.01.2025")      -> firm, confidence 0.85
      * relative spans ("innerhalb von 14 Tagen")  -> soft, confidence 0.7
      * month names    ("bis Ende Januar")         -> soft, confidence 0.6

    Explicit dates already in the past are skipped; parse failures are
    logged at DEBUG level and ignored.
    """
    deadlines: List[DeadlineExtraction] = []
    now = datetime.now()
    # German date patterns; the bool marks whether groups are a dd.mm.yyyy date.
    patterns = [
        # "bis zum 15.01.2025"
        (r"bis\s+(?:zum\s+)?(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True),
        # "spaetestens am 15.01.2025"
        (r"sp\u00e4testens\s+(?:am\s+)?(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True),
        # "Abgabetermin: 15.01.2025"
        (r"(?:Abgabe|Termin|Frist)[:\s]+(\d{1,2})\.(\d{1,2})\.(\d{2,4})", True),
        # "innerhalb von 14 Tagen"
        (r"innerhalb\s+von\s+(\d+)\s+(?:Tagen|Wochen)", False),
        # "bis Ende Januar"
        (r"bis\s+(?:Ende\s+)?(Januar|Februar|M\u00e4rz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)", False),
    ]
    for pattern, is_specific_date in patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            try:
                if is_specific_date:
                    day = int(match.group(1))
                    month = int(match.group(2))
                    year = int(match.group(3))
                    if year < 100:
                        # Two-digit years are interpreted as 20xx.
                        year += 2000
                    deadline_date = datetime(year, month, day)
                    if deadline_date < now:
                        # Ignore deadlines that have already passed.
                        continue
                    # Keep +-50 chars of surrounding context for display.
                    start = max(0, match.start() - 50)
                    end = min(len(text), match.end() + 50)
                    context = text[start:end].strip()
                    deadlines.append(DeadlineExtraction(
                        deadline_date=deadline_date,
                        description=f"Frist: {match.group(0)}",
                        confidence=0.85,
                        source_text=context,
                        is_firm=True,
                    ))
                elif "Tagen" in pattern or "Wochen" in pattern:
                    days = int(match.group(1))
                    # BUGFIX: previously compared "Wochen" (capital W) against
                    # the lowercased match, so weeks were never multiplied.
                    if "wochen" in match.group(0).lower():
                        days *= 7
                    deadline_date = now + timedelta(days=days)
                    deadlines.append(DeadlineExtraction(
                        deadline_date=deadline_date,
                        description=f"Relative Frist: {match.group(0)}",
                        confidence=0.7,
                        source_text=match.group(0),
                        is_firm=False,
                    ))
                else:
                    # Month-name pattern ("bis Ende Januar"). Previously this
                    # branch silently dropped the match; interpret it as the
                    # last day of the next occurrence of that month.
                    month = _GERMAN_MONTHS[match.group(1).lower()]
                    deadline_date = _end_of_month(now.year, month)
                    if deadline_date < now:
                        deadline_date = _end_of_month(now.year + 1, month)
                    deadlines.append(DeadlineExtraction(
                        deadline_date=deadline_date,
                        description=f"Frist: {match.group(0)}",
                        confidence=0.6,
                        source_text=match.group(0),
                        is_firm=False,
                    ))
            except (ValueError, IndexError, KeyError) as e:
                logger.debug(f"Failed to parse date: {e}")
                continue
    return deadlines
def _parse_deadline_lines(result_text: str) -> List[DeadlineExtraction]:
    """Parse `DATUM|BESCHREIBUNG|VERBINDLICH` lines from an LLM answer.

    Lines with fewer than two fields or an unparseable ISO date are skipped.
    The third field defaults to firm ("ja") when missing.
    """
    deadlines: List[DeadlineExtraction] = []
    for line in result_text.strip().split("\n"):
        parts = line.split("|")
        if len(parts) < 2:
            continue
        try:
            deadline_date = datetime.fromisoformat(parts[0].strip())
        except ValueError:
            continue
        is_firm = parts[2].strip().lower() == "ja" if len(parts) > 2 else True
        deadlines.append(DeadlineExtraction(
            deadline_date=deadline_date,
            description=parts[1].strip(),
            confidence=0.7,
            source_text=line,
            is_firm=is_firm,
        ))
    return deadlines


async def _extract_deadlines_llm(
    client: httpx.AsyncClient,
    subject: str,
    body_preview: str,
) -> List[DeadlineExtraction]:
    """Extract deadlines by querying the LLM gateway.

    Sends a German prompt requesting `DATUM|BESCHREIBUNG|VERBINDLICH` lines
    and parses the response. Always returns a list: empty when the gateway
    reports no deadlines, answers with a non-200 status, or any error
    occurs (failures are logged at WARNING level, never raised).
    """
    try:
        prompt = f"""Analysiere diese E-Mail und extrahiere alle genannten Fristen und Termine:
Betreff: {subject}
Inhalt: {body_preview}
Liste alle Fristen im folgenden Format auf (eine pro Zeile):
DATUM|BESCHREIBUNG|VERBINDLICH
Beispiel: 2025-01-15|Abgabe der Berichte|ja
Wenn keine Fristen gefunden werden, antworte mit: KEINE_FRISTEN
Antworte NUR im angegebenen Format.
"""
        response = await client.post(
            f"{LLM_GATEWAY_URL}/api/v1/inference",
            json={
                "prompt": prompt,
                "playbook": "mail_analysis",
                "max_tokens": 200,
            },
        )
        if response.status_code != 200:
            # Previously a non-200 response fell through without an explicit
            # return, so the caller could receive None; treat it as
            # "no deadlines found" and record why.
            logger.warning(f"LLM deadline extraction got status {response.status_code}")
            return []
        result_text = response.json().get("response", "")
        if "KEINE_FRISTEN" in result_text:
            return []
        return _parse_deadline_lines(result_text)
    except Exception as e:
        logger.warning(f"LLM deadline extraction failed: {e}")
        return []