# breakpilot-compliance/backend-compliance/compliance/api/scraper_routes.py

"""
FastAPI routes for Regulation Scraper and PDF extraction.

Endpoints:
- /scraper/status: Scraper status
- /scraper/sources: Available sources
- /scraper/scrape-all: Scrape all sources
- /scraper/scrape/{code}: Scrape single source
- /scraper/extract-bsi: Extract BSI requirements
- /scraper/extract-pdf: Extract from PDF
- /scraper/pdf-documents: List available PDFs
"""

import logging
import os
import uuid

from fastapi import APIRouter, Depends, HTTPException, Query, BackgroundTasks
from sqlalchemy.orm import Session

from classroom_engine.database import get_db

from ..db import RegulationRepository, RequirementRepository
from ..db.models import RequirementDB
from .schemas import BSIAspectResponse, PDFExtractionResponse, PDFExtractionRequest

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router the scraper endpoints in this module register on (tagged "compliance-scraper").
router = APIRouter(tags=["compliance-scraper"])


# ============================================================================
# Regulation Scraper
# ============================================================================

@router.get("/scraper/status")
async def get_scraper_status(db: Session = Depends(get_db)):
    """Return the status reported by the regulation scraper service.

    A ``RegulationScraperService`` is built on the injected session and the
    result of awaiting its ``get_status()`` is returned unchanged.
    """
    # The service is imported inside the handler rather than at module level.
    from ..services.regulation_scraper import RegulationScraperService

    status = await RegulationScraperService(db).get_status()
    return status


@router.get("/scraper/sources")
async def get_scraper_sources(db: Session = Depends(get_db)):
    """Return the regulation sources known to the scraper service.

    Returns:
        A dict with ``"sources"`` (the value of ``get_known_sources()``) and
        ``"total"`` (the length of the service's ``KNOWN_SOURCES`` attribute).
    """
    from ..services.regulation_scraper import RegulationScraperService

    service = RegulationScraperService(db)
    known_sources = service.get_known_sources()
    # The count is taken from KNOWN_SOURCES, not from the returned value above.
    source_count = len(service.KNOWN_SOURCES)
    return {"sources": known_sources, "total": source_count}


@router.post("/scraper/scrape-all")
async def scrape_all_sources(
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
):
    """Scrape all known regulation sources and return the results.

    The scrape is awaited inside the request; ``background_tasks`` is
    injected but not used by this handler.

    Returns:
        A dict with ``"status"`` set to ``"completed"`` and ``"results"``
        holding whatever ``scrape_all()`` returned.
    """
    from ..services.regulation_scraper import RegulationScraperService

    outcome = await RegulationScraperService(db).scrape_all()
    return dict(status="completed", results=outcome)


@router.post("/scraper/scrape/{code}")
async def scrape_single_source(
    code: str,
    force: bool = Query(False, description="Force re-scrape even if data exists"),
    db: Session = Depends(get_db),
):
    """Scrape a specific regulation source.

    Args:
        code: Source code from the URL path, passed to ``scrape_single``.
        force: Query flag forwarded to ``scrape_single`` as ``force``.
        db: Injected database session handed to the scraper service.

    Returns:
        Whatever ``RegulationScraperService.scrape_single`` returns.

    Raises:
        HTTPException: 400 when the service raises ``ValueError``; 500 for
            any other exception. An ``HTTPException`` raised by the service
            is re-raised unchanged.
    """
    from ..services.regulation_scraper import RegulationScraperService

    scraper = RegulationScraperService(db)

    try:
        return await scraper.scrape_single(code, force=force)
    except HTTPException:
        # Keep an already-formed HTTP error instead of rewrapping it as a 500.
        raise
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception("Scraping %s failed: %s", code, e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.post("/scraper/extract-bsi")
async def extract_bsi_requirements(
    code: str = Query("BSI-TR-03161-2", description="BSI TR code"),
    force: bool = Query(False),
    db: Session = Depends(get_db),
):
    """
    Extract requirements from BSI Technical Guidelines.

    Uses pre-defined Pruefaspekte from BSI-TR-03161 documents.

    Args:
        code: BSI TR code; must start with ``"BSI"``.
        force: Query flag forwarded to ``scrape_single`` as ``force``.
        db: Injected database session handed to the scraper service.

    Returns:
        Whatever ``RegulationScraperService.scrape_single`` returns.

    Raises:
        HTTPException: 400 when ``code`` does not start with ``"BSI"`` or
            the service raises ``ValueError``; 500 for any other exception.
            An ``HTTPException`` raised by the service is re-raised
            unchanged.
    """
    from ..services.regulation_scraper import RegulationScraperService

    if not code.startswith("BSI"):
        raise HTTPException(status_code=400, detail="Only BSI codes are supported")

    scraper = RegulationScraperService(db)

    try:
        return await scraper.scrape_single(code, force=force)
    except HTTPException:
        # Keep an already-formed HTTP error instead of rewrapping it as a 500.
        raise
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception("BSI extraction failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.post("/scraper/extract-pdf", response_model=PDFExtractionResponse)
async def extract_pdf_requirements(
    request: PDFExtractionRequest,
    db: Session = Depends(get_db),
):
    """
    Extract Pruefaspekte from BSI-TR PDF documents using PyMuPDF.

    Supported documents:
    - BSI-TR-03161-1: General security requirements
    - BSI-TR-03161-2: Web application security (OAuth, Sessions, etc.)
    - BSI-TR-03161-3: Backend/server security

    Args:
        request: Extraction request. ``document_code`` selects the PDF
            (compared upper-cased), ``save_to_db`` enables persistence and
            ``force`` replaces requirements that already exist.
        db: Injected database session used when ``save_to_db`` is set.

    Returns:
        A ``PDFExtractionResponse`` with the extracted aspects (``full_text``
        truncated to 2000 characters), the extractor statistics and the
        number of requirement rows created.

    Raises:
        HTTPException: 400 for an unsupported document code, 404 when no PDF
            file exists at either known path, 500 when extraction or
            persistence fails (the session is rolled back first).
    """
    from ..services.pdf_extractor import BSIPDFExtractor
    from ..db.models import RegulationTypeEnum

    # Map document codes to file paths
    PDF_PATHS = {
        "BSI-TR-03161-1": "/app/docs/BSI-TR-03161-1.pdf",
        "BSI-TR-03161-2": "/app/docs/BSI-TR-03161-2.pdf",
        "BSI-TR-03161-3": "/app/docs/BSI-TR-03161-3.pdf",
    }

    # Local development paths (fallback)
    LOCAL_PDF_PATHS = {
        "BSI-TR-03161-1": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-1.pdf",
        "BSI-TR-03161-2": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-2.pdf",
        "BSI-TR-03161-3": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-3.pdf",
    }

    # Requirement level -> priority; built once instead of once per aspect.
    priority_map = {"MUSS": 3, "SOLL": 2, "KANN": 1, "DARF NICHT": 3}

    doc_code = request.document_code.upper()
    if doc_code not in PDF_PATHS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported document: {doc_code}. Supported: {list(PDF_PATHS.keys())}"
        )

    # Try container path first, then local path
    pdf_path = PDF_PATHS[doc_code]
    if not os.path.exists(pdf_path):
        pdf_path = LOCAL_PDF_PATHS.get(doc_code)
        if not pdf_path or not os.path.exists(pdf_path):
            raise HTTPException(
                status_code=404,
                detail=f"PDF file not found for {doc_code}"
            )

    try:
        extractor = BSIPDFExtractor()
        aspects = extractor.extract_from_file(pdf_path, source_name=doc_code)
        stats = extractor.get_statistics(aspects)

        # Convert to response format
        aspect_responses = [
            BSIAspectResponse(
                aspect_id=a.aspect_id,
                title=a.title,
                full_text=a.full_text[:2000],
                category=a.category.value,
                page_number=a.page_number,
                section=a.section,
                requirement_level=a.requirement_level.value,
                source_document=a.source_document,
                keywords=a.keywords,
                related_aspects=a.related_aspects,
            )
            for a in aspects
        ]

        requirements_created = 0

        # Save to database if requested
        if request.save_to_db:
            # Get or create regulation
            reg_repo = RegulationRepository(db)
            regulation = reg_repo.get_by_code(doc_code)

            if not regulation:
                regulation = reg_repo.create(
                    code=doc_code,
                    name=f"BSI TR {doc_code.split('-')[-1]}",
                    full_name=f"BSI Technische Richtlinie {doc_code}",
                    regulation_type=RegulationTypeEnum.BSI_STANDARD,
                    local_pdf_path=pdf_path,
                )

            # Create requirements from extracted aspects
            req_repo = RequirementRepository(db)
            existing_articles = {r.article for r in req_repo.get_by_regulation(regulation.id)}

            for aspect in aspects:
                already_stored = aspect.aspect_id in existing_articles
                # Existing requirements are only replaced when force is set.
                if already_stored and not request.force:
                    continue

                # Delete existing if force
                if already_stored:
                    existing = db.query(RequirementDB).filter(
                        RequirementDB.regulation_id == regulation.id,
                        RequirementDB.article == aspect.aspect_id
                    ).first()
                    if existing:
                        db.delete(existing)

                # Determine priority based on requirement level (default 2)
                priority = priority_map.get(aspect.requirement_level.value, 2)

                requirement = RequirementDB(
                    id=str(uuid.uuid4()),
                    regulation_id=regulation.id,
                    article=aspect.aspect_id,
                    paragraph=aspect.section,
                    title=aspect.title[:300],
                    description=f"Kategorie: {aspect.category.value}",
                    requirement_text=aspect.full_text[:4000],
                    is_applicable=True,
                    priority=priority,
                    source_page=aspect.page_number,
                    source_section=aspect.section,
                )
                db.add(requirement)
                requirements_created += 1

            db.commit()

        return PDFExtractionResponse(
            success=True,
            source_document=doc_code,
            total_aspects=len(aspects),
            aspects=aspect_responses,
            statistics=stats,
            requirements_created=requirements_created,
        )

    except ImportError as e:
        raise HTTPException(
            status_code=500,
            detail=f"PyMuPDF not installed: {e}. Run: pip install PyMuPDF"
        ) from e
    except Exception as e:
        # Log with traceback first, then discard pending or failed DB work so
        # the session is not left in a broken transaction.
        logger.exception("PDF extraction failed for %s: %s", doc_code, e)
        db.rollback()
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.get("/scraper/pdf-documents")
async def list_pdf_documents():
    """List the PDF documents that can be extracted and whether each exists.

    Returns:
        A dict with ``"documents"`` (one entry per known document, carrying
        ``code``, ``name``, ``description``, ``expected_aspects`` and an
        ``available`` flag) and ``"total"`` (the number of entries).
    """
    # (code, name, description, expected_aspects) for every known document.
    catalogue = (
        (
            "BSI-TR-03161-1",
            "BSI TR 03161 Teil 1",
            "Allgemeine Sicherheitsanforderungen für mobile Anwendungen",
            "~30",
        ),
        (
            "BSI-TR-03161-2",
            "BSI TR 03161 Teil 2",
            "Web-Anwendungssicherheit (OAuth, Sessions, Input Validation, etc.)",
            "~80-100",
        ),
        (
            "BSI-TR-03161-3",
            "BSI TR 03161 Teil 3",
            "Backend/Server-Sicherheit",
            "~40",
        ),
    )

    documents = []
    for code, name, description, expected in catalogue:
        local_pdf = f"/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/{code}.pdf"
        container_pdf = f"/app/docs/{code}.pdf"
        # A document counts as available when the file exists at either path.
        is_available = os.path.exists(local_pdf) or os.path.exists(container_pdf)
        documents.append(
            {
                "code": code,
                "name": name,
                "description": description,
                "expected_aspects": expected,
                "available": is_available,
            }
        )

    return {"documents": documents, "total": len(documents)}