Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
All services: admin-v2, studio-v2, website, ai-compliance-sdk, consent-service, klausur-service, voice-service, and infrastructure. Large PDFs and compiled binaries excluded via .gitignore.
174 lines
4.4 KiB
Python
174 lines
4.4 KiB
Python
"""
|
|
Legal Crawler API Routes.
|
|
|
|
Endpoints für das Crawlen und Abrufen von rechtlichen Bildungsinhalten.
|
|
"""
|
|
|
|
import logging
|
|
import asyncio
|
|
from typing import List, Optional
|
|
|
|
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
|
from pydantic import BaseModel
|
|
|
|
from ..services.legal_crawler import get_legal_crawler, LegalCrawler
|
|
|
|
# Module-level logger named after this module, per stdlib logging convention.
logger = logging.getLogger(__name__)

# All endpoints below are mounted under the /legal-crawler prefix.
router = APIRouter(prefix="/legal-crawler", tags=["legal-crawler"])
|
|
|
|
|
|
class CrawlStatusResponse(BaseModel):
    """Response model for crawl status endpoints (/start and /status)."""
    # One of: "running", "idle", "started", "already_running".
    status: str
    # Human-readable progress or error description.
    message: str
    # Statistics from the most recent completed crawl; None before the first run.
    stats: Optional[dict] = None
|
|
|
|
|
|
class LegalDocumentResponse(BaseModel):
    """Response model for a single crawled legal document."""
    # Document identifier (stringified DB id).
    id: str
    # Source URL the document was crawled from.
    url: str
    # Document title.
    title: str
    # Name of the law, if known (e.g. a school law) — nullable.
    law_name: Optional[str]
    # Federal-state code the document belongs to, if any (e.g. "NW", "BY").
    state: Optional[str]
    # Extracted paragraphs/sections; None if not yet parsed.
    paragraphs: Optional[list]
    # ISO timestamp of the last crawl of this document, if any.
    last_crawled_at: Optional[str]
|
|
|
|
|
|
class LegalReferenceFromDB(BaseModel):
    """Legal reference as stored in the database."""
    # Name of the law being referenced.
    law: str
    # Source URL of the law text.
    url: str
    # Federal-state code, if the law is state-specific — nullable.
    state: Optional[str]
    # Title of the referenced document.
    title: str
    # Relevant paragraphs/sections of the law.
    paragraphs: list
|
|
|
|
|
|
# Globaler Status für laufenden Crawl
|
|
_crawl_status = {
|
|
"running": False,
|
|
"last_run": None,
|
|
"last_stats": None,
|
|
}
|
|
|
|
|
|
async def _run_crawl(db_pool):
    """Execute a full crawl of the legal seeds and record the outcome.

    Args:
        db_pool: Database connection pool handed to the crawler
            (may be ``None`` while DB wiring is pending — TODO confirm
            the crawler tolerates that).

    Side effects:
        Updates the module-level ``_crawl_status`` dict: sets ``running``
        for the duration of the crawl, then stores either the crawl stats
        and ``"completed"``, or an ``"error: ..."`` marker.
    """
    global _crawl_status
    _crawl_status["running"] = True

    try:
        crawler = get_legal_crawler()
        stats = await crawler.crawl_legal_seeds(db_pool)
        _crawl_status["last_stats"] = stats
        _crawl_status["last_run"] = "completed"
    except Exception as e:
        # FIX: logger.error(f"...") discarded the traceback; logger.exception
        # logs it, and lazy %-args avoid eager string formatting.
        logger.exception("Crawl-Fehler: %s", e)
        _crawl_status["last_run"] = f"error: {str(e)}"
    finally:
        # Always clear the flag so a failed crawl doesn't block future starts.
        _crawl_status["running"] = False
|
|
|
|
|
|
@router.post("/start", response_model=CrawlStatusResponse)
async def start_crawl(background_tasks: BackgroundTasks):
    """Kick off a new crawl of all legal seeds.

    The crawl runs as a FastAPI background task; progress can be polled
    via GET /legal-crawler/status.

    Args:
        background_tasks: FastAPI background-task registry for this request.

    Returns:
        CrawlStatusResponse with status "started", or "already_running"
        if a crawl is still in flight.
    """
    global _crawl_status

    if _crawl_status["running"]:
        return CrawlStatusResponse(
            status="already_running",
            message="Ein Crawl läuft bereits. Bitte warten Sie, bis er abgeschlossen ist."
        )

    # Mark as started immediately so a racing second request sees it.
    _crawl_status["running"] = True
    _crawl_status["last_run"] = "started"

    # FIX: previously the flag was set but no task was ever scheduled, so the
    # status stayed "running" forever and every later /start was rejected.
    # _run_crawl resets the flag in its finally block.
    # TODO: pass the real DB pool once it is wired into the app state.
    background_tasks.add_task(_run_crawl, None)

    return CrawlStatusResponse(
        status="started",
        message="Crawl wurde gestartet. Nutzen Sie /status um den Fortschritt zu prüfen."
    )
|
|
|
|
|
|
@router.get("/status", response_model=CrawlStatusResponse)
async def get_crawl_status():
    """Report whether a crawl is in flight plus the last run's outcome."""
    is_running = _crawl_status["running"]
    last_run_marker = _crawl_status.get("last_run")
    return CrawlStatusResponse(
        status="running" if is_running else "idle",
        message=last_run_marker or "Noch nie gecrawlt",
        stats=_crawl_status.get("last_stats"),
    )
|
|
|
|
|
|
@router.get("/documents", response_model=List[LegalDocumentResponse])
async def get_legal_documents(
    state: Optional[str] = None,
    doc_type: Optional[str] = None,
    limit: int = 50
):
    """Return crawled legal documents.

    Args:
        state: Filter by federal state (e.g. "NW", "BY").
        doc_type: Filter by document type (e.g. "schulgesetz").
        limit: Maximum number of documents to return.

    Returns:
        List of LegalDocumentResponse objects.
    """
    # TODO: implement the DB query once a DB pool is available;
    # until then this endpoint always answers with an empty list.
    return []
|
|
|
|
|
|
@router.get("/references/{state}")
async def get_legal_references_for_state(state: str):
    """Return legal references (school law and paragraphs) for a federal state.

    This is the endpoint consumed by the communication service.

    Args:
        state: Federal-state code (e.g. "NW", "BY", "BE"); "NRW" is
            accepted as an alias for "NW". Case-insensitive.

    Returns:
        Dict with the requested state, its documents (empty until the
        first crawl has run) and a status message.
    """
    # FIX: the previous mapping listed all 16 state codes as identity
    # entries — redundant, since .get(key, key) already falls back to the
    # input. Only genuine aliases need an entry.
    _STATE_ALIASES = {"NRW": "NW"}

    normalized = state.upper()
    # TODO: query documents for `db_state` once the DB pool is wired in;
    # the value is intentionally unused until then.
    db_state = _STATE_ALIASES.get(normalized, normalized)  # noqa: F841

    # Placeholder response until the first crawl has populated the DB.
    return {
        "state": state,
        "documents": [],
        "message": "Dokumente werden nach dem ersten Crawl verfügbar sein"
    }
|