"""
FastAPI routes for Regulation Scraper and PDF extraction.

Endpoints:
- /scraper/status: Scraper status
- /scraper/sources: Available sources
- /scraper/scrape-all: Scrape all sources
- /scraper/scrape/{code}: Scrape single source
- /scraper/extract-bsi: Extract BSI requirements
- /scraper/extract-pdf: Extract from PDF
- /scraper/pdf-documents: List available PDFs
"""

import logging
import os
import uuid

from fastapi import APIRouter, Depends, HTTPException, Query, BackgroundTasks
from sqlalchemy.orm import Session

from classroom_engine.database import get_db

from ..db import RegulationRepository, RequirementRepository
from ..db.models import RequirementDB
from .schemas import BSIAspectResponse, PDFExtractionResponse, PDFExtractionRequest

logger = logging.getLogger(__name__)
router = APIRouter(tags=["compliance-scraper"])


# ============================================================================
# Regulation Scraper
# ============================================================================

@router.get("/scraper/status")
async def get_scraper_status(db: Session = Depends(get_db)):
    """
    Report the scraper's current status.

    Builds a ``RegulationScraperService`` on the request's DB session and
    returns the awaited result of its ``get_status()`` unchanged.
    """
    # Function-scope import, as used by every scraper endpoint in this module.
    from ..services.regulation_scraper import RegulationScraperService

    service = RegulationScraperService(db)
    current_status = await service.get_status()
    return current_status


@router.get("/scraper/sources")
async def get_scraper_sources(db: Session = Depends(get_db)):
    """
    List the regulation sources the scraper knows about.

    Returns:
        A dict with ``sources`` (the result of ``get_known_sources()``) and
        ``total`` (the length of the service's ``KNOWN_SOURCES``).
    """
    from ..services.regulation_scraper import RegulationScraperService

    service = RegulationScraperService(db)
    known_sources = service.get_known_sources()
    # The count is taken from KNOWN_SOURCES, not from the list returned above.
    source_count = len(service.KNOWN_SOURCES)
    return {
        "sources": known_sources,
        "total": source_count,
    }


@router.post("/scraper/scrape-all")
async def scrape_all_sources(
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
):
    """
    Scrape every known regulation source and return the results.

    Returns:
        A dict with ``status`` set to ``"completed"`` and ``results`` holding
        whatever ``scrape_all()`` produced.
    """
    from ..services.regulation_scraper import RegulationScraperService

    # NOTE(review): background_tasks is accepted but never used; the scrape is
    # awaited inline, so the response is only sent once scraping has finished.
    service = RegulationScraperService(db)
    return {
        "status": "completed",
        "results": await service.scrape_all(),
    }
# Directories searched for BSI-TR PDFs, in priority order: the container
# mount first, then the local development checkout as a fallback.
_PDF_SEARCH_DIRS = (
    "/app/docs",
    "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs",
)

# Document codes accepted by /scraper/extract-pdf.
_SUPPORTED_PDF_CODES = (
    "BSI-TR-03161-1",
    "BSI-TR-03161-2",
    "BSI-TR-03161-3",
)

# Requirement level (as found in ``aspect.requirement_level.value``) mapped to
# the stored priority; levels not listed here fall back to _DEFAULT_PRIORITY.
_PRIORITY_BY_LEVEL = {"MUSS": 3, "SOLL": 2, "KANN": 1, "DARF NICHT": 3}
_DEFAULT_PRIORITY = 2


def _find_pdf(doc_code: str):
    """Return the first existing ``<dir>/<doc_code>.pdf`` path, or None."""
    for directory in _PDF_SEARCH_DIRS:
        candidate = f"{directory}/{doc_code}.pdf"
        if os.path.exists(candidate):
            return candidate
    return None


async def _scrape_single_or_raise(db: Session, code: str, force: bool, failure_context: str):
    """
    Run ``scrape_single`` and translate failures into HTTP errors.

    Args:
        db: Session handed to ``RegulationScraperService``.
        code: Regulation source code to scrape.
        force: Forwarded to ``scrape_single`` as ``force``.
        failure_context: Prefix of the log line written on unexpected errors.

    Returns:
        Whatever ``scrape_single`` returns.

    Raises:
        HTTPException: 400 when the service raises ``ValueError``, 500 for any
            other exception. An ``HTTPException`` raised by the service is
            passed through untouched instead of being rewrapped as a 500.
    """
    from ..services.regulation_scraper import RegulationScraperService

    scraper = RegulationScraperService(db)
    try:
        return await scraper.scrape_single(code, force=force)
    except HTTPException:
        raise
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # logger.exception keeps the traceback that logger.error would drop.
        logger.exception("%s: %s", failure_context, e)
        raise HTTPException(status_code=500, detail=str(e)) from e


def _store_aspects_as_requirements(db: Session, doc_code: str, pdf_path: str,
                                   aspects, force: bool, regulation_type) -> int:
    """
    Persist extracted aspects as ``RequirementDB`` rows and commit.

    The regulation identified by ``doc_code`` is looked up and created when
    missing. An aspect whose id is already stored as an article of that
    regulation is skipped unless ``force`` is set; with ``force`` every stored
    row for that article is deleted before the new one is added.

    Args:
        db: Session used for queries, inserts and the final commit.
        doc_code: Regulation code, e.g. ``BSI-TR-03161-2``.
        pdf_path: Path stored as ``local_pdf_path`` on a newly created regulation.
        aspects: Extracted aspects, as returned by the PDF extractor.
        force: Replace requirements that already exist.
        regulation_type: Value stored as ``regulation_type`` on a newly created
            regulation.

    Returns:
        Number of requirement rows added.
    """
    reg_repo = RegulationRepository(db)
    regulation = reg_repo.get_by_code(doc_code)
    if not regulation:
        regulation = reg_repo.create(
            code=doc_code,
            name=f"BSI TR {doc_code.split('-')[-1]}",
            full_name=f"BSI Technische Richtlinie {doc_code}",
            regulation_type=regulation_type,
            local_pdf_path=pdf_path,
        )

    req_repo = RequirementRepository(db)
    existing_articles = {r.article for r in req_repo.get_by_regulation(regulation.id)}

    created = 0
    for aspect in aspects:
        already_stored = aspect.aspect_id in existing_articles
        if already_stored and not force:
            continue

        if already_stored:
            # Remove every stored row for this article, not just the first
            # match, so a forced re-import leaves no stale duplicates behind.
            stale_rows = db.query(RequirementDB).filter(
                RequirementDB.regulation_id == regulation.id,
                RequirementDB.article == aspect.aspect_id,
            ).all()
            for stale in stale_rows:
                db.delete(stale)

        priority = _PRIORITY_BY_LEVEL.get(aspect.requirement_level.value, _DEFAULT_PRIORITY)
        db.add(RequirementDB(
            id=str(uuid.uuid4()),
            regulation_id=regulation.id,
            article=aspect.aspect_id,
            paragraph=aspect.section,
            title=aspect.title[:300],
            description=f"Kategorie: {aspect.category.value}",
            requirement_text=aspect.full_text[:4000],
            is_applicable=True,
            priority=priority,
            source_page=aspect.page_number,
            source_section=aspect.section,
        ))
        created += 1

    db.commit()
    return created


@router.post("/scraper/scrape/{code}")
async def scrape_single_source(
    code: str,
    force: bool = Query(False, description="Force re-scrape even if data exists"),
    db: Session = Depends(get_db),
):
    """
    Scrape a specific regulation source.

    Raises:
        HTTPException: 400 when the scraper rejects the request with a
            ``ValueError``, 500 on any other failure.
    """
    return await _scrape_single_or_raise(db, code, force, f"Scraping {code} failed")


@router.post("/scraper/extract-bsi")
async def extract_bsi_requirements(
    code: str = Query("BSI-TR-03161-2", description="BSI TR code"),
    force: bool = Query(False),
    db: Session = Depends(get_db),
):
    """
    Extract requirements from BSI Technical Guidelines.

    Uses pre-defined Pruefaspekte from BSI-TR-03161 documents.

    Raises:
        HTTPException: 400 when ``code`` does not start with ``BSI`` or the
            scraper raises ``ValueError``, 500 on any other failure.
    """
    if not code.startswith("BSI"):
        raise HTTPException(status_code=400, detail="Only BSI codes are supported")

    return await _scrape_single_or_raise(db, code, force, "BSI extraction failed")


@router.post("/scraper/extract-pdf", response_model=PDFExtractionResponse)
async def extract_pdf_requirements(
    request: PDFExtractionRequest,
    db: Session = Depends(get_db),
):
    """
    Extract Pruefaspekte from BSI-TR PDF documents using PyMuPDF.

    Supported documents:
    - BSI-TR-03161-1: General security requirements
    - BSI-TR-03161-2: Web application security (OAuth, Sessions, etc.)
    - BSI-TR-03161-3: Backend/server security

    When ``request.save_to_db`` is set, the extracted aspects are also stored
    as requirements; ``request.force`` replaces requirements that already exist.

    Raises:
        HTTPException: 400 for an unsupported document code, 404 when no PDF
            file is found, 500 when PyMuPDF is missing or extraction/storage
            fails.
    """
    from ..db.models import RegulationTypeEnum

    doc_code = request.document_code.upper()
    if doc_code not in _SUPPORTED_PDF_CODES:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported document: {doc_code}. Supported: {list(_SUPPORTED_PDF_CODES)}"
        )

    pdf_path = _find_pdf(doc_code)
    if pdf_path is None:
        raise HTTPException(
            status_code=404,
            detail=f"PDF file not found for {doc_code}"
        )

    try:
        # Imported inside the try so that an ImportError raised while loading
        # the extractor module is reported by the handler below.
        from ..services.pdf_extractor import BSIPDFExtractor

        # NOTE(review): extraction runs synchronously inside this async
        # endpoint; consider a threadpool if large PDFs stall other requests.
        extractor = BSIPDFExtractor()
        aspects = extractor.extract_from_file(pdf_path, source_name=doc_code)
        stats = extractor.get_statistics(aspects)

        # Convert to response format; full_text is capped at 2000 characters.
        aspect_responses = [
            BSIAspectResponse(
                aspect_id=a.aspect_id,
                title=a.title,
                full_text=a.full_text[:2000],
                category=a.category.value,
                page_number=a.page_number,
                section=a.section,
                requirement_level=a.requirement_level.value,
                source_document=a.source_document,
                keywords=a.keywords,
                related_aspects=a.related_aspects,
            )
            for a in aspects
        ]

        requirements_created = 0
        if request.save_to_db:
            requirements_created = _store_aspects_as_requirements(
                db,
                doc_code,
                pdf_path,
                aspects,
                force=request.force,
                regulation_type=RegulationTypeEnum.BSI_STANDARD,
            )

        return PDFExtractionResponse(
            success=True,
            source_document=doc_code,
            total_aspects=len(aspects),
            aspects=aspect_responses,
            statistics=stats,
            requirements_created=requirements_created,
        )

    except HTTPException:
        raise
    except ImportError as e:
        raise HTTPException(
            status_code=500,
            detail=f"PyMuPDF not installed: {e}. Run: pip install PyMuPDF"
        ) from e
    except Exception as e:
        # Discard any partially staged rows so the session is left clean.
        db.rollback()
        logger.exception("PDF extraction failed for %s: %s", doc_code, e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.get("/scraper/pdf-documents")
async def list_pdf_documents():
    """
    List the PDF documents known to the extractor.

    Returns:
        A dict with ``documents`` (one entry per document, each carrying an
        ``available`` flag that is true when the PDF exists in one of the
        searched directories) and ``total`` (the number of entries).
    """
    catalogue = [
        {
            "code": "BSI-TR-03161-1",
            "name": "BSI TR 03161 Teil 1",
            "description": "Allgemeine Sicherheitsanforderungen für mobile Anwendungen",
            "expected_aspects": "~30",
        },
        {
            "code": "BSI-TR-03161-2",
            "name": "BSI TR 03161 Teil 2",
            "description": "Web-Anwendungssicherheit (OAuth, Sessions, Input Validation, etc.)",
            "expected_aspects": "~80-100",
        },
        {
            "code": "BSI-TR-03161-3",
            "name": "BSI TR 03161 Teil 3",
            "description": "Backend/Server-Sicherheit",
            "expected_aspects": "~40",
        },
    ]

    # Same lookup as /scraper/extract-pdf, so "available" matches what the
    # extraction endpoint can actually open.
    for doc in catalogue:
        doc["available"] = _find_pdf(doc["code"]) is not None

    return {
        "documents": catalogue,
        "total": len(catalogue),
    }