[split-required] Split 700-870 LOC files across all services

backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 08:01:18 +02:00
parent b6983ab1dc
commit 34da9f4cda
106 changed files with 16500 additions and 16947 deletions

View File

@@ -0,0 +1,461 @@
"""
DSFA RAG API Route Handlers.
Endpoint implementations for search, sources, ingestion, stats, and init.
"""
import logging
from typing import List, Optional
from fastapi import APIRouter, HTTPException, Query, Depends
from dsfa_corpus_ingestion import (
DSFACorpusStore,
DSFAQdrantService,
DSFASearchResult,
LICENSE_REGISTRY,
DSFA_SOURCES,
generate_attribution_notice,
get_license_label,
DSFA_COLLECTION,
chunk_document,
)
from dsfa_rag_models import (
DSFASourceResponse,
DSFAChunkResponse,
DSFASearchResultResponse,
DSFASearchResponse,
DSFASourceStatsResponse,
DSFACorpusStatsResponse,
IngestRequest,
IngestResponse,
LicenseInfo,
)
from dsfa_rag_embedding import (
get_embedding,
get_embeddings_batch,
extract_text_from_url,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/dsfa-rag", tags=["DSFA RAG"])
# =============================================================================
# Dependency Injection
# =============================================================================
# Process-wide database pool; handlers receive it indirectly via get_store().
_db_pool = None


def set_db_pool(pool):
    """Inject the shared asyncpg-style pool used by all route dependencies."""
    global _db_pool
    _db_pool = pool
async def get_store() -> DSFACorpusStore:
    """Dependency: corpus store bound to the injected pool (503 when missing)."""
    if _db_pool is not None:
        return DSFACorpusStore(_db_pool)
    # Pool is only set after startup wiring; before that the API is unusable.
    raise HTTPException(status_code=503, detail="Database not initialized")
async def get_qdrant() -> DSFAQdrantService:
    """Dependency: a fresh Qdrant service wrapper for the current request."""
    service = DSFAQdrantService()
    return service
# =============================================================================
# API Endpoints
# =============================================================================
@router.get("/search", response_model=DSFASearchResponse)
async def search_dsfa_corpus(
    query: str = Query(..., min_length=3, description="Search query"),
    source_codes: Optional[List[str]] = Query(None, description="Filter by source codes"),
    document_types: Optional[List[str]] = Query(None, description="Filter by document types (guideline, checklist, regulation)"),
    categories: Optional[List[str]] = Query(None, description="Filter by categories (threshold_analysis, risk_assessment, mitigation)"),
    limit: int = Query(10, ge=1, le=50, description="Maximum results"),
    include_attribution: bool = Query(True, description="Include attribution in results"),
    store: DSFACorpusStore = Depends(get_store),
    qdrant: DSFAQdrantService = Depends(get_qdrant)
):
    """
    Search the DSFA corpus and return matching chunks with full
    source/license attribution for compliance purposes.
    """
    # Embed the query once, then run the filtered vector search.
    embedding = await get_embedding(query)
    hits = await qdrant.search(
        query_embedding=embedding,
        source_codes=source_codes,
        document_types=document_types,
        categories=categories,
        limit=limit
    )

    # Build the response objects in a single pass, collecting license codes.
    licenses_used = set()
    results = []
    for hit in hits:
        lic_code = hit.get("license_code", "")
        lic_info = LICENSE_REGISTRY.get(lic_code, {})
        licenses_used.add(lic_code)
        results.append(DSFASearchResultResponse(
            chunk_id=hit.get("chunk_id", ""),
            content=hit.get("content", ""),
            score=hit.get("score", 0.0),
            source_code=hit.get("source_code", ""),
            source_name=hit.get("source_name", ""),
            attribution_text=hit.get("attribution_text", ""),
            license_code=lic_code,
            license_name=lic_info.get("name", lic_code),
            license_url=lic_info.get("url"),
            attribution_required=hit.get("attribution_required", True),
            source_url=hit.get("source_url"),
            document_type=hit.get("document_type"),
            category=hit.get("category"),
            section_title=hit.get("section_title"),
            page_number=hit.get("page_number")
        ))

    # The attribution helper expects plain DSFASearchResult records, so the
    # response models are mirrored back only when a notice is requested.
    attribution_notice = ""
    if include_attribution:
        attribution_notice = generate_attribution_notice([
            DSFASearchResult(
                chunk_id=r.chunk_id,
                content=r.content,
                score=r.score,
                source_code=r.source_code,
                source_name=r.source_name,
                attribution_text=r.attribution_text,
                license_code=r.license_code,
                license_url=r.license_url,
                attribution_required=r.attribution_required,
                source_url=r.source_url,
                document_type=r.document_type or "",
                category=r.category or "",
                section_title=r.section_title,
                page_number=r.page_number
            )
            for r in results
        ])

    return DSFASearchResponse(
        query=query,
        results=results,
        total_results=len(results),
        licenses_used=list(licenses_used),
        attribution_notice=attribution_notice
    )
@router.get("/sources", response_model=List[DSFASourceResponse])
async def list_dsfa_sources(
    document_type: Optional[str] = Query(None, description="Filter by document type"),
    license_code: Optional[str] = Query(None, description="Filter by license"),
    store: DSFACorpusStore = Depends(get_store)
):
    """List all registered DSFA sources with license info."""
    def _as_response(row):
        # Enrich the stored row with human-readable license metadata.
        lic_info = LICENSE_REGISTRY.get(row.get("license_code", ""), {})
        return DSFASourceResponse(
            id=str(row["id"]),
            source_code=row["source_code"],
            name=row["name"],
            full_name=row.get("full_name"),
            organization=row.get("organization"),
            source_url=row.get("source_url"),
            license_code=row.get("license_code", ""),
            license_name=lic_info.get("name", row.get("license_code", "")),
            license_url=lic_info.get("url"),
            attribution_required=row.get("attribution_required", True),
            attribution_text=row.get("attribution_text", ""),
            document_type=row.get("document_type"),
            language=row.get("language", "de")
        )

    # Apply the optional filters; an unset filter matches every row.
    return [
        _as_response(row)
        for row in await store.list_sources()
        if (not document_type or row.get("document_type") == document_type)
        and (not license_code or row.get("license_code") == license_code)
    ]
@router.get("/sources/available")
async def list_available_sources():
    """List all available source definitions (from DSFA_SOURCES constant)."""
    catalog = []
    for entry in DSFA_SOURCES:
        catalog.append({
            "source_code": entry["source_code"],
            "name": entry["name"],
            "organization": entry.get("organization"),
            "license_code": entry["license_code"],
            "document_type": entry.get("document_type")
        })
    return catalog
@router.get("/sources/{source_code}", response_model=DSFASourceResponse)
async def get_dsfa_source(
    source_code: str,
    store: DSFACorpusStore = Depends(get_store)
):
    """Get details for a specific source."""
    row = await store.get_source_by_code(source_code)
    if row is None or not row:
        raise HTTPException(status_code=404, detail=f"Source not found: {source_code}")

    # Resolve the license entry so the response carries its display name/URL.
    lic_code = row.get("license_code", "")
    lic_info = LICENSE_REGISTRY.get(lic_code, {})
    return DSFASourceResponse(
        id=str(row["id"]),
        source_code=row["source_code"],
        name=row["name"],
        full_name=row.get("full_name"),
        organization=row.get("organization"),
        source_url=row.get("source_url"),
        license_code=lic_code,
        license_name=lic_info.get("name", lic_code),
        license_url=lic_info.get("url"),
        attribution_required=row.get("attribution_required", True),
        attribution_text=row.get("attribution_text", ""),
        document_type=row.get("document_type"),
        language=row.get("language", "de")
    )
async def _extract_ingest_text(request: IngestRequest) -> str:
    """Return the raw document text for an ingest request.

    Prefers inline ``document_text``; otherwise extracts text from
    ``document_url``.

    Raises:
        HTTPException 400: extraction from the URL failed, or the resulting
            text is shorter than 50 characters after stripping.
    """
    text_content = request.document_text
    if request.document_url and not text_content:
        # Lazy %-args keep formatting cost out of disabled log levels.
        logger.info("Extracting text from URL: %s", request.document_url)
        text_content = await extract_text_from_url(request.document_url)
        if not text_content:
            raise HTTPException(
                status_code=400,
                detail=f"Could not extract text from URL: {request.document_url}"
            )
    if not text_content or len(text_content.strip()) < 50:
        raise HTTPException(status_code=400, detail="Document text too short (min 50 chars)")
    return text_content


async def _embed_and_index_chunks(store, qdrant, source, source_code, document_id, chunks):
    """Persist chunk rows, embed their contents, and index them in Qdrant.

    Returns the number of chunks actually indexed by Qdrant.
    """
    chunk_texts = [chunk["content"] for chunk in chunks]
    logger.info("Generating embeddings for %d chunks...", len(chunk_texts))
    embeddings = await get_embeddings_batch(chunk_texts)

    chunk_records = []
    for i, chunk in enumerate(chunks):
        chunk_id = await store.create_chunk(
            document_id=document_id,
            source_id=str(source["id"]),
            content=chunk["content"],
            chunk_index=i,
            section_title=chunk.get("section_title"),
            page_number=chunk.get("page_number"),
            category=chunk.get("category")
        )
        # Denormalized payload so each search hit carries full attribution.
        chunk_records.append({
            "chunk_id": chunk_id,
            "document_id": document_id,
            "source_id": str(source["id"]),
            "content": chunk["content"],
            "section_title": chunk.get("section_title"),
            "source_code": source_code,
            "source_name": source["name"],
            "attribution_text": source["attribution_text"],
            "license_code": source["license_code"],
            "attribution_required": source.get("attribution_required", True),
            "document_type": source.get("document_type", ""),
            "category": chunk.get("category", ""),
            "language": source.get("language", "de"),
            "page_number": chunk.get("page_number")
        })
    return await qdrant.index_chunks(chunk_records, embeddings)


@router.post("/sources/{source_code}/ingest", response_model=IngestResponse)
async def ingest_dsfa_source(
    source_code: str,
    request: IngestRequest,
    store: DSFACorpusStore = Depends(get_store),
    qdrant: DSFAQdrantService = Depends(get_qdrant)
):
    """
    Trigger ingestion for a specific source.

    The document may be supplied as direct text or via URL.

    Raises:
        HTTPException 404: unknown ``source_code``.
        HTTPException 400: neither text nor URL provided, URL extraction
            failed, or the document text is too short.
    """
    source = await store.get_source_by_code(source_code)
    if not source:
        raise HTTPException(status_code=404, detail=f"Source not found: {source_code}")
    if not request.document_text and not request.document_url:
        raise HTTPException(
            status_code=400,
            detail="Either document_text or document_url must be provided"
        )

    # Make sure the vector collection exists before any writes.
    await qdrant.ensure_collection()
    text_content = await _extract_ingest_text(request)

    document_id = await store.create_document(
        source_id=str(source["id"]),
        title=request.title or f"Document for {source_code}",
        file_type="text",
        metadata={"ingested_via": "api", "source_code": source_code}
    )

    chunks = chunk_document(text_content, source_code)
    if not chunks:
        # The document row is kept even when chunking yields nothing.
        return IngestResponse(
            source_code=source_code,
            document_id=document_id,
            chunks_created=0,
            message="Document created but no chunks generated"
        )

    indexed_count = await _embed_and_index_chunks(
        store, qdrant, source, source_code, document_id, chunks
    )
    await store.update_document_indexed(document_id, len(chunks))
    return IngestResponse(
        source_code=source_code,
        document_id=document_id,
        chunks_created=indexed_count,
        message=f"Successfully ingested {indexed_count} chunks from document"
    )
@router.get("/chunks/{chunk_id}", response_model=DSFAChunkResponse)
async def get_chunk_with_attribution(
    chunk_id: str,
    store: DSFACorpusStore = Depends(get_store)
):
    """Get single chunk with full source attribution."""
    row = await store.get_chunk_with_attribution(chunk_id)
    if not row:
        raise HTTPException(status_code=404, detail=f"Chunk not found: {chunk_id}")

    lic_code = row.get("license_code", "")
    lic_info = LICENSE_REGISTRY.get(lic_code, {})
    return DSFAChunkResponse(
        chunk_id=str(row["chunk_id"]),
        content=row.get("content", ""),
        section_title=row.get("section_title"),
        page_number=row.get("page_number"),
        category=row.get("category"),
        document_id=str(row.get("document_id", "")),
        document_title=row.get("document_title"),
        source_id=str(row.get("source_id", "")),
        source_code=row.get("source_code", ""),
        source_name=row.get("source_name", ""),
        attribution_text=row.get("attribution_text", ""),
        license_code=lic_code,
        license_name=lic_info.get("name", lic_code),
        license_url=lic_info.get("url"),
        attribution_required=row.get("attribution_required", True),
        source_url=row.get("source_url"),
        document_type=row.get("document_type")
    )
@router.get("/stats", response_model=DSFACorpusStatsResponse)
async def get_corpus_stats(
    store: DSFACorpusStore = Depends(get_store),
    qdrant: DSFAQdrantService = Depends(get_qdrant)
):
    """Get corpus statistics for dashboard."""
    rows = await store.get_source_stats()
    per_source = []
    total_documents = 0
    total_chunks = 0
    for row in rows:
        # Coalesce NULL counts from the aggregate query to zero.
        docs = row.get("document_count", 0) or 0
        chunks = row.get("chunk_count", 0) or 0
        total_documents += docs
        total_chunks += chunks
        indexed_at = row.get("last_indexed_at")
        per_source.append(DSFASourceStatsResponse(
            source_id=str(row.get("source_id", "")),
            source_code=row.get("source_code", ""),
            name=row.get("name", ""),
            organization=row.get("organization"),
            license_code=row.get("license_code", ""),
            document_type=row.get("document_type"),
            document_count=docs,
            chunk_count=chunks,
            last_indexed_at=indexed_at.isoformat() if indexed_at else None
        ))

    qdrant_info = await qdrant.get_stats()
    return DSFACorpusStatsResponse(
        sources=per_source,
        total_sources=len(rows),
        total_documents=total_documents,
        total_chunks=total_chunks,
        qdrant_collection=DSFA_COLLECTION,
        qdrant_points_count=qdrant_info.get("points_count", 0),
        qdrant_status=qdrant_info.get("status", "unknown")
    )
@router.get("/licenses")
async def list_licenses():
    """List all supported licenses with their terms."""
    licenses = []
    for code, info in LICENSE_REGISTRY.items():
        licenses.append(LicenseInfo(
            code=code,
            name=info.get("name", code),
            url=info.get("url"),
            attribution_required=info.get("attribution_required", True),
            modification_allowed=info.get("modification_allowed", True),
            commercial_use=info.get("commercial_use", True)
        ))
    return licenses
@router.post("/init")
async def initialize_dsfa_corpus(
    store: DSFACorpusStore = Depends(get_store),
    qdrant: DSFAQdrantService = Depends(get_qdrant)
):
    """
    Initialize DSFA corpus.

    - Creates the Qdrant collection
    - Registers all predefined sources from DSFA_SOURCES

    Registration failures are logged with a traceback and skipped, so a
    single bad source definition does not abort initialization.
    """
    qdrant_ok = await qdrant.ensure_collection()
    registered = 0
    for source in DSFA_SOURCES:
        try:
            await store.register_source(source)
            registered += 1
        except Exception:
            # Use the module logger instead of print() so the failure reaches
            # configured log handlers and includes the traceback.
            logger.exception("Error registering source %s", source["source_code"])
    return {
        "qdrant_collection_created": qdrant_ok,
        "sources_registered": registered,
        "total_sources": len(DSFA_SOURCES)
    }