feat(klausur-service): Add Tesseract OCR, DSFA RAG, TrOCR, grid detection and vocab session store
New modules: - tesseract_vocab_extractor.py: Bounding-box OCR with multi-PSM pipeline - grid_detection_service.py: CV-based grid/table detection for worksheets - vocab_session_store.py: PostgreSQL persistence for vocab sessions - trocr_api.py: TrOCR handwriting recognition endpoint - dsfa_rag_api.py + dsfa_corpus_ingestion.py: DSFA RAG corpus search Changes: - Dockerfile: Install tesseract-ocr + deu/eng language packs - requirements.txt: Add PyMuPDF, pytesseract, Pillow - main.py: Register new routers, init DB pools + Qdrant collections Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
715
klausur-service/backend/dsfa_rag_api.py
Normal file
715
klausur-service/backend/dsfa_rag_api.py
Normal file
@@ -0,0 +1,715 @@
|
||||
"""
|
||||
DSFA RAG API Endpoints.
|
||||
|
||||
Provides REST API for searching DSFA corpus with full source attribution.
|
||||
|
||||
Endpoints:
|
||||
- GET /api/v1/dsfa-rag/search - Semantic search with attribution
|
||||
- GET /api/v1/dsfa-rag/sources - List all registered sources
|
||||
- POST /api/v1/dsfa-rag/sources/{code}/ingest - Trigger source ingestion
|
||||
- GET /api/v1/dsfa-rag/chunks/{id} - Get single chunk with attribution
|
||||
- GET /api/v1/dsfa-rag/stats - Get corpus statistics
|
||||
"""
|
||||
|
||||
import os
|
||||
import uuid
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
from dataclasses import dataclass, asdict
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter, HTTPException, Query, Depends
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Embedding service configuration
|
||||
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://172.18.0.13:8087")
|
||||
|
||||
# Import from ingestion module
|
||||
from dsfa_corpus_ingestion import (
|
||||
DSFACorpusStore,
|
||||
DSFAQdrantService,
|
||||
DSFASearchResult,
|
||||
LICENSE_REGISTRY,
|
||||
DSFA_SOURCES,
|
||||
generate_attribution_notice,
|
||||
get_license_label,
|
||||
DSFA_COLLECTION,
|
||||
chunk_document
|
||||
)
|
||||
|
||||
router = APIRouter(prefix="/api/v1/dsfa-rag", tags=["DSFA RAG"])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Pydantic Models
|
||||
# =============================================================================
|
||||
|
||||
class DSFASourceResponse(BaseModel):
    """Response model for a registered DSFA source, including license terms."""
    # Database UUID of the source record (stringified)
    id: str
    # Short machine-readable identifier, e.g. used in Qdrant payload filters
    source_code: str
    name: str
    full_name: Optional[str] = None
    # Publishing organization (e.g. a data-protection authority)
    organization: Optional[str] = None
    source_url: Optional[str] = None
    # License fields resolved against LICENSE_REGISTRY by the endpoints
    license_code: str
    license_name: str
    license_url: Optional[str] = None
    attribution_required: bool
    attribution_text: str
    # e.g. guideline, checklist, regulation
    document_type: Optional[str] = None
    language: str = "de"
|
||||
|
||||
|
||||
class DSFAChunkResponse(BaseModel):
    """Response model for a single chunk with full source attribution."""
    chunk_id: str
    content: str
    section_title: Optional[str] = None
    page_number: Optional[int] = None
    # e.g. threshold_analysis, risk_assessment, mitigation
    category: Optional[str] = None

    # Document info
    document_id: str
    document_title: Optional[str] = None

    # Attribution (always included so consumers can satisfy license terms)
    source_id: str
    source_code: str
    source_name: str
    attribution_text: str
    license_code: str
    license_name: str
    license_url: Optional[str] = None
    attribution_required: bool
    source_url: Optional[str] = None
    document_type: Optional[str] = None
|
||||
|
||||
|
||||
class DSFASearchResultResponse(BaseModel):
    """Response model for one semantic-search hit with attribution metadata."""
    chunk_id: str
    content: str
    # Similarity score as returned by the vector store
    score: float

    # Attribution — required for license compliance in rendered output
    source_code: str
    source_name: str
    attribution_text: str
    license_code: str
    license_name: str
    license_url: Optional[str] = None
    attribution_required: bool
    source_url: Optional[str] = None

    # Metadata
    document_type: Optional[str] = None
    category: Optional[str] = None
    section_title: Optional[str] = None
    page_number: Optional[int] = None
|
||||
|
||||
|
||||
class DSFASearchResponse(BaseModel):
    """Response model for the /search endpoint."""
    # Echo of the original query string
    query: str
    results: List[DSFASearchResultResponse]
    total_results: int

    # Aggregated license codes across all hits, for rendering a footer
    licenses_used: List[str]
    # Pre-formatted attribution notice (empty if attribution was not requested)
    attribution_notice: str
|
||||
|
||||
|
||||
class DSFASourceStatsResponse(BaseModel):
    """Per-source statistics row for the /stats endpoint."""
    source_id: str
    source_code: str
    name: str
    organization: Optional[str] = None
    license_code: str
    document_type: Optional[str] = None
    document_count: int
    chunk_count: int
    # ISO-8601 timestamp of the last indexing run, if any
    last_indexed_at: Optional[str] = None
|
||||
|
||||
|
||||
class DSFACorpusStatsResponse(BaseModel):
    """Corpus-wide statistics combining PostgreSQL and Qdrant counters."""
    sources: List[DSFASourceStatsResponse]
    total_sources: int
    total_documents: int
    total_chunks: int
    # Vector-store side of the picture (may drift from PostgreSQL counts)
    qdrant_collection: str
    qdrant_points_count: int
    qdrant_status: str
|
||||
|
||||
|
||||
class IngestRequest(BaseModel):
    """Request model for ingestion.

    At least one of document_url / document_text must be supplied; the
    ingest endpoint rejects requests that provide neither.
    """
    # URL to download and extract the document from (PDF or HTML)
    document_url: Optional[str] = None
    # Raw document text, used directly and taking precedence over the URL
    document_text: Optional[str] = None
    # Optional human-readable title; a default is derived from the source code
    title: Optional[str] = None
|
||||
|
||||
|
||||
class IngestResponse(BaseModel):
    """Response model for ingestion."""
    source_code: str
    # UUID of the created document record (None only if creation was skipped)
    document_id: Optional[str] = None
    # Number of chunks actually indexed in Qdrant
    chunks_created: int
    # Human-readable summary of the outcome
    message: str
|
||||
|
||||
|
||||
class LicenseInfo(BaseModel):
    """License terms as recorded in LICENSE_REGISTRY."""
    code: str
    name: str
    url: Optional[str] = None
    # Whether reuse must credit the source
    attribution_required: bool
    # Whether derivative works are permitted
    modification_allowed: bool
    # Whether commercial use is permitted
    commercial_use: bool
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Dependency Injection
|
||||
# =============================================================================
|
||||
|
||||
# Database pool (will be set from main.py)
|
||||
_db_pool = None
|
||||
|
||||
|
||||
def set_db_pool(pool):
    """Set the module-level database pool used by the API endpoints.

    Called once from main.py during application startup; until then
    get_store() responds with HTTP 503.
    """
    global _db_pool
    _db_pool = pool
|
||||
|
||||
|
||||
async def get_store() -> DSFACorpusStore:
    """Dependency: build a DSFA corpus store bound to the shared DB pool.

    Raises HTTP 503 while the pool has not yet been injected via
    set_db_pool() during startup.
    """
    pool = _db_pool
    if pool is None:
        raise HTTPException(status_code=503, detail="Database not initialized")
    return DSFACorpusStore(pool)
|
||||
|
||||
|
||||
async def get_qdrant() -> DSFAQdrantService:
    """Dependency: return a fresh Qdrant service instance per request."""
    return DSFAQdrantService()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Embedding Service Integration
|
||||
# =============================================================================
|
||||
|
||||
async def get_embedding(text: str) -> List[float]:
    """
    Embed a single text via the external embedding-service.

    Uses the BGE-M3 model, which produces 1024-dimensional vectors. If the
    service cannot be reached, falls back to a deterministic hash-based
    pseudo-embedding so development setups keep working.
    """
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed-single",
                json={"text": text},
            )
            response.raise_for_status()
            payload = response.json()
    except httpx.HTTPError as e:
        logger.error(f"Embedding service error: {e}")
        # Fallback to hash-based pseudo-embedding for development
        return _generate_fallback_embedding(text)
    return payload.get("embedding", [])
|
||||
|
||||
|
||||
async def get_embeddings_batch(texts: List[str]) -> List[List[float]]:
    """
    Get embeddings for multiple texts in one round-trip to the service.

    On HTTP failure, falls back to per-text hash-based pseudo-embeddings
    so the result list always has one vector per input text.
    """
    # Longer timeout than the single-text path: batches can be large
    async with httpx.AsyncClient(timeout=120.0) as client:
        try:
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed",
                json={"texts": texts}
            )
            response.raise_for_status()
            data = response.json()
            return data.get("embeddings", [])
        except httpx.HTTPError as e:
            logger.error(f"Embedding service batch error: {e}")
            # Fallback
            return [_generate_fallback_embedding(t) for t in texts]
|
||||
|
||||
|
||||
async def extract_text_from_url(url: str) -> str:
    """
    Extract plain text from a document URL (PDF, HTML, etc.).

    Strategy: first delegate to the embedding-service's /extract-pdf
    endpoint; if that fails, fetch the URL directly and, for HTML
    responses, strip markup with regexes. Returns "" on any failure.
    NOTE(review): the regex fallback does not decode HTML entities
    (&amp; etc.) — confirm whether downstream chunking cares.
    """
    async with httpx.AsyncClient(timeout=120.0) as client:
        try:
            # First try to use the embedding-service's extract-pdf endpoint
            response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/extract-pdf",
                json={"url": url}
            )
            response.raise_for_status()
            data = response.json()
            return data.get("text", "")
        except httpx.HTTPError as e:
            logger.error(f"PDF extraction error for {url}: {e}")
            # Fallback: try to fetch HTML content directly
            try:
                response = await client.get(url, follow_redirects=True)
                response.raise_for_status()
                content_type = response.headers.get("content-type", "")
                if "html" in content_type:
                    # Simple HTML text extraction
                    import re
                    html = response.text
                    # Remove scripts and styles (DOTALL: tags may span lines)
                    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
                    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
                    # Remove remaining tags, replacing them with spaces
                    text = re.sub(r'<[^>]+>', ' ', html)
                    # Collapse runs of whitespace
                    text = re.sub(r'\s+', ' ', text).strip()
                    return text
                else:
                    # Non-HTML, non-PDF content is not supported by the fallback
                    return ""
            except Exception as fetch_err:
                logger.error(f"Fallback fetch error for {url}: {fetch_err}")
                return ""
|
||||
|
||||
|
||||
def _generate_fallback_embedding(text: str) -> List[float]:
|
||||
"""
|
||||
Generate deterministic pseudo-embedding from text hash.
|
||||
Used as fallback when embedding service is unavailable.
|
||||
"""
|
||||
import hashlib
|
||||
import struct
|
||||
|
||||
hash_bytes = hashlib.sha256(text.encode()).digest()
|
||||
embedding = []
|
||||
for i in range(0, min(len(hash_bytes), 128), 4):
|
||||
val = struct.unpack('f', hash_bytes[i:i+4])[0]
|
||||
embedding.append(val % 1.0)
|
||||
|
||||
# Pad to 1024 dimensions
|
||||
while len(embedding) < 1024:
|
||||
embedding.extend(embedding[:min(len(embedding), 1024 - len(embedding))])
|
||||
|
||||
return embedding[:1024]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# API Endpoints
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/search", response_model=DSFASearchResponse)
async def search_dsfa_corpus(
    query: str = Query(..., min_length=3, description="Search query"),
    source_codes: Optional[List[str]] = Query(None, description="Filter by source codes"),
    document_types: Optional[List[str]] = Query(None, description="Filter by document types (guideline, checklist, regulation)"),
    categories: Optional[List[str]] = Query(None, description="Filter by categories (threshold_analysis, risk_assessment, mitigation)"),
    limit: int = Query(10, ge=1, le=50, description="Maximum results"),
    include_attribution: bool = Query(True, description="Include attribution in results"),
    store: DSFACorpusStore = Depends(get_store),
    qdrant: DSFAQdrantService = Depends(get_qdrant)
):
    """
    Search DSFA corpus with full attribution.

    Embeds the query, runs a filtered vector search in Qdrant, and returns
    matching chunks with source/license information for compliance. When
    include_attribution is set, an aggregated attribution notice suitable
    for a footer is included.
    """
    # Get query embedding (falls back to a pseudo-embedding if the service is down)
    query_embedding = await get_embedding(query)

    # Search Qdrant with optional payload filters
    raw_results = await qdrant.search(
        query_embedding=query_embedding,
        source_codes=source_codes,
        document_types=document_types,
        categories=categories,
        limit=limit
    )

    # Transform raw payloads into response models, collecting license codes
    results = []
    licenses_used = set()

    for r in raw_results:
        license_code = r.get("license_code", "")
        license_info = LICENSE_REGISTRY.get(license_code, {})

        result = DSFASearchResultResponse(
            chunk_id=r.get("chunk_id", ""),
            content=r.get("content", ""),
            score=r.get("score", 0.0),
            source_code=r.get("source_code", ""),
            source_name=r.get("source_name", ""),
            attribution_text=r.get("attribution_text", ""),
            license_code=license_code,
            license_name=license_info.get("name", license_code),
            license_url=license_info.get("url"),
            attribution_required=r.get("attribution_required", True),
            source_url=r.get("source_url"),
            document_type=r.get("document_type"),
            category=r.get("category"),
            section_title=r.get("section_title"),
            page_number=r.get("page_number")
        )
        results.append(result)
        # Skip empty codes so the footer never lists a blank license
        if license_code:
            licenses_used.add(license_code)

    # Build the attribution notice only when requested — constructing the
    # intermediate DSFASearchResult objects is wasted work otherwise.
    attribution_notice = ""
    if include_attribution:
        search_results = [
            DSFASearchResult(
                chunk_id=r.chunk_id,
                content=r.content,
                score=r.score,
                source_code=r.source_code,
                source_name=r.source_name,
                attribution_text=r.attribution_text,
                license_code=r.license_code,
                license_url=r.license_url,
                attribution_required=r.attribution_required,
                source_url=r.source_url,
                document_type=r.document_type or "",
                category=r.category or "",
                section_title=r.section_title,
                page_number=r.page_number
            )
            for r in results
        ]
        attribution_notice = generate_attribution_notice(search_results)

    return DSFASearchResponse(
        query=query,
        results=results,
        total_results=len(results),
        # sorted() makes the license list deterministic across requests
        licenses_used=sorted(licenses_used),
        attribution_notice=attribution_notice
    )
|
||||
|
||||
|
||||
@router.get("/sources", response_model=List[DSFASourceResponse])
async def list_dsfa_sources(
    document_type: Optional[str] = Query(None, description="Filter by document type"),
    license_code: Optional[str] = Query(None, description="Filter by license"),
    store: DSFACorpusStore = Depends(get_store)
):
    """List all registered DSFA sources with license info.

    Filtering is applied in Python after fetching all sources; fine for the
    small, fixed set of DSFA sources this corpus holds.
    """
    sources = await store.list_sources()

    result = []
    for s in sources:
        # Apply optional filters
        if document_type and s.get("document_type") != document_type:
            continue
        if license_code and s.get("license_code") != license_code:
            continue

        # Resolve license terms; unknown codes fall back to the code itself
        license_info = LICENSE_REGISTRY.get(s.get("license_code", ""), {})

        result.append(DSFASourceResponse(
            id=str(s["id"]),
            source_code=s["source_code"],
            name=s["name"],
            full_name=s.get("full_name"),
            organization=s.get("organization"),
            source_url=s.get("source_url"),
            license_code=s.get("license_code", ""),
            license_name=license_info.get("name", s.get("license_code", "")),
            license_url=license_info.get("url"),
            attribution_required=s.get("attribution_required", True),
            attribution_text=s.get("attribution_text", ""),
            document_type=s.get("document_type"),
            language=s.get("language", "de")
        ))

    return result
|
||||
|
||||
|
||||
@router.get("/sources/available")
async def list_available_sources():
    """List all available source definitions (from DSFA_SOURCES constant)."""
    summaries = []
    for definition in DSFA_SOURCES:
        summaries.append({
            "source_code": definition["source_code"],
            "name": definition["name"],
            "organization": definition.get("organization"),
            "license_code": definition["license_code"],
            "document_type": definition.get("document_type"),
        })
    return summaries
|
||||
|
||||
|
||||
@router.get("/sources/{source_code}", response_model=DSFASourceResponse)
async def get_dsfa_source(
    source_code: str,
    store: DSFACorpusStore = Depends(get_store)
):
    """Get details for a specific source, or HTTP 404 if unknown."""
    source = await store.get_source_by_code(source_code)

    if not source:
        raise HTTPException(status_code=404, detail=f"Source not found: {source_code}")

    # Resolve license terms; unknown codes fall back to the code itself
    license_info = LICENSE_REGISTRY.get(source.get("license_code", ""), {})

    return DSFASourceResponse(
        id=str(source["id"]),
        source_code=source["source_code"],
        name=source["name"],
        full_name=source.get("full_name"),
        organization=source.get("organization"),
        source_url=source.get("source_url"),
        license_code=source.get("license_code", ""),
        license_name=license_info.get("name", source.get("license_code", "")),
        license_url=license_info.get("url"),
        attribution_required=source.get("attribution_required", True),
        attribution_text=source.get("attribution_text", ""),
        document_type=source.get("document_type"),
        language=source.get("language", "de")
    )
|
||||
|
||||
|
||||
@router.post("/sources/{source_code}/ingest", response_model=IngestResponse)
async def ingest_dsfa_source(
    source_code: str,
    request: IngestRequest,
    store: DSFACorpusStore = Depends(get_store),
    qdrant: DSFAQdrantService = Depends(get_qdrant)
):
    """
    Trigger ingestion for a specific source.

    Pipeline: validate source and input -> obtain text (direct or via URL
    extraction) -> create a document record -> chunk -> embed in batch ->
    persist chunks in PostgreSQL and index them in Qdrant.

    Can provide the document via URL or direct text; direct text wins if
    both are given.

    Raises:
        HTTPException 404: unknown source_code.
        HTTPException 400: no input, failed extraction, or text too short.
    """
    # Get source
    source = await store.get_source_by_code(source_code)
    if not source:
        raise HTTPException(status_code=404, detail=f"Source not found: {source_code}")

    # Need either URL or text
    if not request.document_text and not request.document_url:
        raise HTTPException(
            status_code=400,
            detail="Either document_text or document_url must be provided"
        )

    # Ensure Qdrant collection exists before indexing anything
    await qdrant.ensure_collection()

    # Get text content — direct text takes precedence over the URL
    text_content = request.document_text
    if request.document_url and not text_content:
        # Download and extract text from URL
        logger.info(f"Extracting text from URL: {request.document_url}")
        text_content = await extract_text_from_url(request.document_url)
        if not text_content:
            raise HTTPException(
                status_code=400,
                detail=f"Could not extract text from URL: {request.document_url}"
            )

    # Reject trivially short documents that would produce useless chunks
    if not text_content or len(text_content.strip()) < 50:
        raise HTTPException(status_code=400, detail="Document text too short (min 50 chars)")

    # Create document record
    doc_title = request.title or f"Document for {source_code}"
    document_id = await store.create_document(
        source_id=str(source["id"]),
        title=doc_title,
        file_type="text",
        metadata={"ingested_via": "api", "source_code": source_code}
    )

    # Chunk the document
    chunks = chunk_document(text_content, source_code)

    if not chunks:
        return IngestResponse(
            source_code=source_code,
            document_id=document_id,
            chunks_created=0,
            message="Document created but no chunks generated"
        )

    # Generate embeddings in batch for efficiency
    chunk_texts = [chunk["content"] for chunk in chunks]
    logger.info(f"Generating embeddings for {len(chunk_texts)} chunks...")
    embeddings = await get_embeddings_batch(chunk_texts)

    # Create chunk records in PostgreSQL and prepare payloads for Qdrant.
    # NOTE(review): zip() silently truncates if the embedding service
    # returns fewer vectors than chunks — confirm the service guarantees
    # one embedding per input text.
    chunk_records = []
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        # Create chunk in PostgreSQL
        chunk_id = await store.create_chunk(
            document_id=document_id,
            source_id=str(source["id"]),
            content=chunk["content"],
            chunk_index=i,
            section_title=chunk.get("section_title"),
            page_number=chunk.get("page_number"),
            category=chunk.get("category")
        )

        # Denormalized payload: attribution/license fields are copied onto
        # every Qdrant point so search results carry them without a join
        chunk_records.append({
            "chunk_id": chunk_id,
            "document_id": document_id,
            "source_id": str(source["id"]),
            "content": chunk["content"],
            "section_title": chunk.get("section_title"),
            "source_code": source_code,
            "source_name": source["name"],
            "attribution_text": source["attribution_text"],
            "license_code": source["license_code"],
            "attribution_required": source.get("attribution_required", True),
            "document_type": source.get("document_type", ""),
            "category": chunk.get("category", ""),
            "language": source.get("language", "de"),
            "page_number": chunk.get("page_number")
        })

    # Index in Qdrant
    indexed_count = await qdrant.index_chunks(chunk_records, embeddings)

    # Update document record with the number of chunks produced
    await store.update_document_indexed(document_id, len(chunks))

    return IngestResponse(
        source_code=source_code,
        document_id=document_id,
        chunks_created=indexed_count,
        message=f"Successfully ingested {indexed_count} chunks from document"
    )
|
||||
|
||||
|
||||
@router.get("/chunks/{chunk_id}", response_model=DSFAChunkResponse)
async def get_chunk_with_attribution(
    chunk_id: str,
    store: DSFACorpusStore = Depends(get_store)
):
    """Get a single chunk with full source attribution, or HTTP 404."""
    chunk = await store.get_chunk_with_attribution(chunk_id)

    if not chunk:
        raise HTTPException(status_code=404, detail=f"Chunk not found: {chunk_id}")

    # Resolve license terms; unknown codes fall back to the code itself
    license_info = LICENSE_REGISTRY.get(chunk.get("license_code", ""), {})

    return DSFAChunkResponse(
        chunk_id=str(chunk["chunk_id"]),
        content=chunk.get("content", ""),
        section_title=chunk.get("section_title"),
        page_number=chunk.get("page_number"),
        category=chunk.get("category"),
        document_id=str(chunk.get("document_id", "")),
        document_title=chunk.get("document_title"),
        source_id=str(chunk.get("source_id", "")),
        source_code=chunk.get("source_code", ""),
        source_name=chunk.get("source_name", ""),
        attribution_text=chunk.get("attribution_text", ""),
        license_code=chunk.get("license_code", ""),
        license_name=license_info.get("name", chunk.get("license_code", "")),
        license_url=license_info.get("url"),
        attribution_required=chunk.get("attribution_required", True),
        source_url=chunk.get("source_url"),
        document_type=chunk.get("document_type")
    )
|
||||
|
||||
|
||||
@router.get("/stats", response_model=DSFACorpusStatsResponse)
async def get_corpus_stats(
    store: DSFACorpusStore = Depends(get_store),
    qdrant: DSFAQdrantService = Depends(get_qdrant)
):
    """Get corpus statistics for the dashboard.

    Combines per-source counts from PostgreSQL with collection-level
    point counts from Qdrant; the two may drift if indexing failed partway.
    """
    # Get PostgreSQL stats
    source_stats = await store.get_source_stats()

    total_docs = 0
    total_chunks = 0
    stats_response = []

    for s in source_stats:
        # `or 0` guards against NULL counts coming back from SQL aggregates
        doc_count = s.get("document_count", 0) or 0
        chunk_count = s.get("chunk_count", 0) or 0
        total_docs += doc_count
        total_chunks += chunk_count

        last_indexed = s.get("last_indexed_at")

        stats_response.append(DSFASourceStatsResponse(
            source_id=str(s.get("source_id", "")),
            source_code=s.get("source_code", ""),
            name=s.get("name", ""),
            organization=s.get("organization"),
            license_code=s.get("license_code", ""),
            document_type=s.get("document_type"),
            document_count=doc_count,
            chunk_count=chunk_count,
            # Serialize the timestamp for the JSON response
            last_indexed_at=last_indexed.isoformat() if last_indexed else None
        ))

    # Get Qdrant stats
    qdrant_stats = await qdrant.get_stats()

    return DSFACorpusStatsResponse(
        sources=stats_response,
        total_sources=len(source_stats),
        total_documents=total_docs,
        total_chunks=total_chunks,
        qdrant_collection=DSFA_COLLECTION,
        qdrant_points_count=qdrant_stats.get("points_count", 0),
        qdrant_status=qdrant_stats.get("status", "unknown")
    )
|
||||
|
||||
|
||||
@router.get("/licenses")
async def list_licenses():
    """List all supported licenses with their terms."""
    licenses = []
    for code, info in LICENSE_REGISTRY.items():
        licenses.append(LicenseInfo(
            code=code,
            name=info.get("name", code),
            url=info.get("url"),
            attribution_required=info.get("attribution_required", True),
            modification_allowed=info.get("modification_allowed", True),
            commercial_use=info.get("commercial_use", True),
        ))
    return licenses
|
||||
|
||||
|
||||
@router.post("/init")
async def initialize_dsfa_corpus(
    store: DSFACorpusStore = Depends(get_store),
    qdrant: DSFAQdrantService = Depends(get_qdrant)
):
    """
    Initialize the DSFA corpus.

    - Creates the Qdrant collection (idempotent)
    - Registers all predefined sources, best-effort: one failing source
      does not abort the rest

    Returns a summary dict with the collection status and how many
    sources were registered.
    """
    # Ensure Qdrant collection exists
    qdrant_ok = await qdrant.ensure_collection()

    # Register all sources; keep going on individual failures
    registered = 0
    for source in DSFA_SOURCES:
        try:
            await store.register_source(source)
            registered += 1
        except Exception as e:
            # Was print() — use the module logger like every other error path
            logger.error(f"Error registering source {source['source_code']}: {e}")

    return {
        "qdrant_collection_created": qdrant_ok,
        "sources_registered": registered,
        "total_sources": len(DSFA_SOURCES)
    }
|
||||
Reference in New Issue
Block a user