[split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
461
klausur-service/backend/dsfa_rag_routes.py
Normal file
461
klausur-service/backend/dsfa_rag_routes.py
Normal file
@@ -0,0 +1,461 @@
|
||||
"""
|
||||
DSFA RAG API Route Handlers.
|
||||
|
||||
Endpoint implementations for search, sources, ingestion, stats, and init.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Query, Depends
|
||||
|
||||
from dsfa_corpus_ingestion import (
|
||||
DSFACorpusStore,
|
||||
DSFAQdrantService,
|
||||
DSFASearchResult,
|
||||
LICENSE_REGISTRY,
|
||||
DSFA_SOURCES,
|
||||
generate_attribution_notice,
|
||||
get_license_label,
|
||||
DSFA_COLLECTION,
|
||||
chunk_document,
|
||||
)
|
||||
|
||||
from dsfa_rag_models import (
|
||||
DSFASourceResponse,
|
||||
DSFAChunkResponse,
|
||||
DSFASearchResultResponse,
|
||||
DSFASearchResponse,
|
||||
DSFASourceStatsResponse,
|
||||
DSFACorpusStatsResponse,
|
||||
IngestRequest,
|
||||
IngestResponse,
|
||||
LicenseInfo,
|
||||
)
|
||||
|
||||
from dsfa_rag_embedding import (
|
||||
get_embedding,
|
||||
get_embeddings_batch,
|
||||
extract_text_from_url,
|
||||
)
|
||||
|
||||
# Module-level logger, shared by every route handler in this file.
logger = logging.getLogger(__name__)

# All endpoints below are mounted under this prefix.
router = APIRouter(prefix="/api/v1/dsfa-rag", tags=["DSFA RAG"])


# =============================================================================
# Dependency Injection
# =============================================================================

# Shared database pool; stays None until set_db_pool() is called at startup,
# and get_store() answers 503 while it is unset.
_db_pool = None
|
||||
|
||||
|
||||
def set_db_pool(pool):
    """Inject the shared database pool used by the route handlers.

    Called once during application startup; until then get_store()
    rejects requests with HTTP 503.
    """
    global _db_pool
    _db_pool = pool
|
||||
|
||||
|
||||
async def get_store() -> DSFACorpusStore:
    """FastAPI dependency: a corpus store bound to the injected pool.

    Raises:
        HTTPException: 503 when set_db_pool() has not been called yet.
    """
    pool = _db_pool
    if pool is None:
        raise HTTPException(status_code=503, detail="Database not initialized")
    return DSFACorpusStore(pool)
|
||||
|
||||
|
||||
async def get_qdrant() -> DSFAQdrantService:
    """FastAPI dependency: construct a fresh Qdrant service per request."""
    return DSFAQdrantService()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/search", response_model=DSFASearchResponse)
async def search_dsfa_corpus(
    query: str = Query(..., min_length=3, description="Search query"),
    source_codes: Optional[List[str]] = Query(None, description="Filter by source codes"),
    document_types: Optional[List[str]] = Query(None, description="Filter by document types (guideline, checklist, regulation)"),
    categories: Optional[List[str]] = Query(None, description="Filter by categories (threshold_analysis, risk_assessment, mitigation)"),
    limit: int = Query(10, ge=1, le=50, description="Maximum results"),
    include_attribution: bool = Query(True, description="Include attribution in results"),
    store: DSFACorpusStore = Depends(get_store),
    qdrant: DSFAQdrantService = Depends(get_qdrant)
):
    """
    Search the DSFA corpus with full attribution.

    Embeds the query, runs a filtered vector search against Qdrant, and
    enriches each hit with license metadata from LICENSE_REGISTRY so
    callers can satisfy attribution requirements.

    Returns:
        DSFASearchResponse with ranked results, the distinct licenses used,
        and a combined attribution notice (empty string when
        include_attribution is False).
    """
    query_embedding = await get_embedding(query)

    raw_results = await qdrant.search(
        query_embedding=query_embedding,
        source_codes=source_codes,
        document_types=document_types,
        categories=categories,
        limit=limit
    )

    results = []
    licenses_used = set()
    # Fix: the original rebuilt every DSFASearchResult in a second full
    # mapping pass over the response objects — even when no attribution
    # notice was requested. Build both representations in one pass, and
    # only when the notice is actually needed.
    attribution_inputs = []

    for r in raw_results:
        license_code = r.get("license_code", "")
        license_info = LICENSE_REGISTRY.get(license_code, {})
        license_url = license_info.get("url")

        results.append(DSFASearchResultResponse(
            chunk_id=r.get("chunk_id", ""),
            content=r.get("content", ""),
            score=r.get("score", 0.0),
            source_code=r.get("source_code", ""),
            source_name=r.get("source_name", ""),
            attribution_text=r.get("attribution_text", ""),
            license_code=license_code,
            license_name=license_info.get("name", license_code),
            license_url=license_url,
            attribution_required=r.get("attribution_required", True),
            source_url=r.get("source_url"),
            document_type=r.get("document_type"),
            category=r.get("category"),
            section_title=r.get("section_title"),
            page_number=r.get("page_number")
        ))
        licenses_used.add(license_code)

        if include_attribution:
            attribution_inputs.append(DSFASearchResult(
                chunk_id=r.get("chunk_id", ""),
                content=r.get("content", ""),
                score=r.get("score", 0.0),
                source_code=r.get("source_code", ""),
                source_name=r.get("source_name", ""),
                attribution_text=r.get("attribution_text", ""),
                license_code=license_code,
                license_url=license_url,
                attribution_required=r.get("attribution_required", True),
                source_url=r.get("source_url"),
                # DSFASearchResult requires plain strings for these fields.
                document_type=r.get("document_type") or "",
                category=r.get("category") or "",
                section_title=r.get("section_title"),
                page_number=r.get("page_number")
            ))

    attribution_notice = generate_attribution_notice(attribution_inputs) if include_attribution else ""

    return DSFASearchResponse(
        query=query,
        results=results,
        total_results=len(results),
        licenses_used=list(licenses_used),
        attribution_notice=attribution_notice
    )
|
||||
|
||||
|
||||
@router.get("/sources", response_model=List[DSFASourceResponse])
async def list_dsfa_sources(
    document_type: Optional[str] = Query(None, description="Filter by document type"),
    license_code: Optional[str] = Query(None, description="Filter by license"),
    store: DSFACorpusStore = Depends(get_store)
):
    """List all registered DSFA sources, optionally filtered, with license info."""
    sources = await store.list_sources()

    responses = []
    for src in sources:
        # An empty/absent filter matches everything.
        type_ok = not document_type or src.get("document_type") == document_type
        license_ok = not license_code or src.get("license_code") == license_code
        if not (type_ok and license_ok):
            continue

        lic = LICENSE_REGISTRY.get(src.get("license_code", ""), {})

        responses.append(DSFASourceResponse(
            id=str(src["id"]),
            source_code=src["source_code"],
            name=src["name"],
            full_name=src.get("full_name"),
            organization=src.get("organization"),
            source_url=src.get("source_url"),
            license_code=src.get("license_code", ""),
            license_name=lic.get("name", src.get("license_code", "")),
            license_url=lic.get("url"),
            attribution_required=src.get("attribution_required", True),
            attribution_text=src.get("attribution_text", ""),
            document_type=src.get("document_type"),
            language=src.get("language", "de")
        ))

    return responses
|
||||
|
||||
|
||||
@router.get("/sources/available")
async def list_available_sources():
    """List all available source definitions from the DSFA_SOURCES constant."""
    catalogue = []
    for src in DSFA_SOURCES:
        catalogue.append({
            "source_code": src["source_code"],
            "name": src["name"],
            "organization": src.get("organization"),
            "license_code": src["license_code"],
            "document_type": src.get("document_type")
        })
    return catalogue
|
||||
|
||||
|
||||
@router.get("/sources/{source_code}", response_model=DSFASourceResponse)
async def get_dsfa_source(
    source_code: str,
    store: DSFACorpusStore = Depends(get_store)
):
    """Return one registered source enriched with license metadata.

    Raises:
        HTTPException: 404 when no source with the given code exists.
    """
    src = await store.get_source_by_code(source_code)
    if not src:
        raise HTTPException(status_code=404, detail=f"Source not found: {source_code}")

    lic_code = src.get("license_code", "")
    lic = LICENSE_REGISTRY.get(lic_code, {})

    return DSFASourceResponse(
        id=str(src["id"]),
        source_code=src["source_code"],
        name=src["name"],
        full_name=src.get("full_name"),
        organization=src.get("organization"),
        source_url=src.get("source_url"),
        license_code=lic_code,
        license_name=lic.get("name", lic_code),
        license_url=lic.get("url"),
        attribution_required=src.get("attribution_required", True),
        attribution_text=src.get("attribution_text", ""),
        document_type=src.get("document_type"),
        language=src.get("language", "de")
    )
|
||||
|
||||
|
||||
def _build_chunk_point(chunk, chunk_id, document_id, source, source_code):
    """Build the flat metadata payload indexed into Qdrant for one chunk."""
    return {
        "chunk_id": chunk_id,
        "document_id": document_id,
        "source_id": str(source["id"]),
        "content": chunk["content"],
        "section_title": chunk.get("section_title"),
        "source_code": source_code,
        "source_name": source["name"],
        "attribution_text": source["attribution_text"],
        "license_code": source["license_code"],
        "attribution_required": source.get("attribution_required", True),
        "document_type": source.get("document_type", ""),
        "category": chunk.get("category", ""),
        "language": source.get("language", "de"),
        "page_number": chunk.get("page_number")
    }


@router.post("/sources/{source_code}/ingest", response_model=IngestResponse)
async def ingest_dsfa_source(
    source_code: str,
    request: IngestRequest,
    store: DSFACorpusStore = Depends(get_store),
    qdrant: DSFAQdrantService = Depends(get_qdrant)
):
    """
    Trigger ingestion for a specific source.

    The document may be supplied as raw text (request.document_text) or as
    a URL (request.document_url) whose text is extracted server-side. The
    text is chunked, embedded in a batch, persisted via the store, and
    indexed into Qdrant.

    Raises:
        HTTPException: 404 for an unknown source_code; 400 when neither
            text nor URL is given, URL extraction yields nothing, or the
            text is shorter than 50 characters.
    """
    source = await store.get_source_by_code(source_code)
    if not source:
        raise HTTPException(status_code=404, detail=f"Source not found: {source_code}")

    if not request.document_text and not request.document_url:
        raise HTTPException(
            status_code=400,
            detail="Either document_text or document_url must be provided"
        )

    await qdrant.ensure_collection()

    text_content = request.document_text
    if request.document_url and not text_content:
        # Lazy %-style args instead of an eagerly formatted f-string.
        logger.info("Extracting text from URL: %s", request.document_url)
        text_content = await extract_text_from_url(request.document_url)
        if not text_content:
            raise HTTPException(
                status_code=400,
                detail=f"Could not extract text from URL: {request.document_url}"
            )

    if not text_content or len(text_content.strip()) < 50:
        raise HTTPException(status_code=400, detail="Document text too short (min 50 chars)")

    doc_title = request.title or f"Document for {source_code}"
    document_id = await store.create_document(
        source_id=str(source["id"]),
        title=doc_title,
        file_type="text",
        metadata={"ingested_via": "api", "source_code": source_code}
    )

    chunks = chunk_document(text_content, source_code)

    if not chunks:
        return IngestResponse(
            source_code=source_code,
            document_id=document_id,
            chunks_created=0,
            message="Document created but no chunks generated"
        )

    chunk_texts = [chunk["content"] for chunk in chunks]
    logger.info("Generating embeddings for %d chunks...", len(chunk_texts))
    embeddings = await get_embeddings_batch(chunk_texts)

    chunk_records = []
    # zip() intentionally truncates to the shorter sequence, so chunks
    # without a matching embedding are skipped (original behavior kept).
    # The embedding itself is only consumed later by index_chunks.
    for i, (chunk, _) in enumerate(zip(chunks, embeddings)):
        chunk_id = await store.create_chunk(
            document_id=document_id,
            source_id=str(source["id"]),
            content=chunk["content"],
            chunk_index=i,
            section_title=chunk.get("section_title"),
            page_number=chunk.get("page_number"),
            category=chunk.get("category")
        )
        chunk_records.append(
            _build_chunk_point(chunk, chunk_id, document_id, source, source_code)
        )

    indexed_count = await qdrant.index_chunks(chunk_records, embeddings)
    # NOTE(review): the DB is marked with len(chunks) while the response
    # reports indexed_count; these can differ if Qdrant drops points.
    # Preserved as-is to avoid a behavior change — confirm intent.
    await store.update_document_indexed(document_id, len(chunks))

    return IngestResponse(
        source_code=source_code,
        document_id=document_id,
        chunks_created=indexed_count,
        message=f"Successfully ingested {indexed_count} chunks from document"
    )
|
||||
|
||||
|
||||
@router.get("/chunks/{chunk_id}", response_model=DSFAChunkResponse)
async def get_chunk_with_attribution(
    chunk_id: str,
    store: DSFACorpusStore = Depends(get_store)
):
    """Fetch one chunk together with its document, source, and license data.

    Raises:
        HTTPException: 404 when the chunk does not exist.
    """
    row = await store.get_chunk_with_attribution(chunk_id)
    if not row:
        raise HTTPException(status_code=404, detail=f"Chunk not found: {chunk_id}")

    lic_code = row.get("license_code", "")
    lic = LICENSE_REGISTRY.get(lic_code, {})

    return DSFAChunkResponse(
        chunk_id=str(row["chunk_id"]),
        content=row.get("content", ""),
        section_title=row.get("section_title"),
        page_number=row.get("page_number"),
        category=row.get("category"),
        document_id=str(row.get("document_id", "")),
        document_title=row.get("document_title"),
        source_id=str(row.get("source_id", "")),
        source_code=row.get("source_code", ""),
        source_name=row.get("source_name", ""),
        attribution_text=row.get("attribution_text", ""),
        license_code=lic_code,
        license_name=lic.get("name", lic_code),
        license_url=lic.get("url"),
        attribution_required=row.get("attribution_required", True),
        source_url=row.get("source_url"),
        document_type=row.get("document_type")
    )
|
||||
|
||||
|
||||
@router.get("/stats", response_model=DSFACorpusStatsResponse)
async def get_corpus_stats(
    store: DSFACorpusStore = Depends(get_store),
    qdrant: DSFAQdrantService = Depends(get_qdrant)
):
    """Aggregate per-source and collection-wide corpus statistics."""
    source_stats = await store.get_source_stats()

    per_source = []
    for row in source_stats:
        # Counts may come back as None; coerce to 0.
        docs = row.get("document_count", 0) or 0
        chunk_total = row.get("chunk_count", 0) or 0
        last_indexed = row.get("last_indexed_at")

        per_source.append(DSFASourceStatsResponse(
            source_id=str(row.get("source_id", "")),
            source_code=row.get("source_code", ""),
            name=row.get("name", ""),
            organization=row.get("organization"),
            license_code=row.get("license_code", ""),
            document_type=row.get("document_type"),
            document_count=docs,
            chunk_count=chunk_total,
            last_indexed_at=last_indexed.isoformat() if last_indexed else None
        ))

    qdrant_stats = await qdrant.get_stats()

    return DSFACorpusStatsResponse(
        sources=per_source,
        total_sources=len(source_stats),
        total_documents=sum(s.document_count for s in per_source),
        total_chunks=sum(s.chunk_count for s in per_source),
        qdrant_collection=DSFA_COLLECTION,
        qdrant_points_count=qdrant_stats.get("points_count", 0),
        qdrant_status=qdrant_stats.get("status", "unknown")
    )
|
||||
|
||||
|
||||
@router.get("/licenses")
async def list_licenses():
    """List every license in LICENSE_REGISTRY with its usage terms."""
    licenses = []
    for code, info in LICENSE_REGISTRY.items():
        licenses.append(LicenseInfo(
            code=code,
            name=info.get("name", code),
            url=info.get("url"),
            attribution_required=info.get("attribution_required", True),
            modification_allowed=info.get("modification_allowed", True),
            commercial_use=info.get("commercial_use", True)
        ))
    return licenses
|
||||
|
||||
|
||||
@router.post("/init")
async def initialize_dsfa_corpus(
    store: DSFACorpusStore = Depends(get_store),
    qdrant: DSFAQdrantService = Depends(get_qdrant)
):
    """
    Initialize the DSFA corpus.

    - Creates the Qdrant collection
    - Registers all predefined sources from DSFA_SOURCES

    A failure registering one source is logged and skipped so a single bad
    entry does not abort the whole initialization.

    Returns:
        Summary dict with collection status and registration counts.
    """
    qdrant_ok = await qdrant.ensure_collection()

    registered = 0
    for source in DSFA_SOURCES:
        try:
            await store.register_source(source)
            registered += 1
        except Exception:
            # Fix: use the module logger (with traceback) instead of print(),
            # consistent with the rest of this module.
            logger.exception("Error registering source %s", source["source_code"])

    return {
        "qdrant_collection_created": qdrant_ok,
        "sources_registered": registered,
        "total_sources": len(DSFA_SOURCES)
    }
|
||||
Reference in New Issue
Block a user