# Services: Admin-Compliance, Backend-Compliance, AI-Compliance-SDK,
# Consent-SDK, Developer-Portal, PCA-Platform, DSMS
"""
|
|
FastAPI routes for Regulation Scraper and PDF extraction.
|
|
|
|
Endpoints:
|
|
- /scraper/status: Scraper status
|
|
- /scraper/sources: Available sources
|
|
- /scraper/scrape-all: Scrape all sources
|
|
- /scraper/scrape/{code}: Scrape single source
|
|
- /scraper/extract-bsi: Extract BSI requirements
|
|
- /scraper/extract-pdf: Extract from PDF
|
|
- /scraper/pdf-documents: List available PDFs
|
|
"""
|
|
|
|
# Standard library
import logging
import os
import uuid

# Third-party
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query
from sqlalchemy.orm import Session

# Application
from classroom_engine.database import get_db

# Package-local
from ..db import RegulationRepository, RequirementRepository
from ..db.models import RequirementDB
from .schemas import BSIAspectResponse, PDFExtractionRequest, PDFExtractionResponse
|
|
|
|
# Module-scoped logger, named after this module's dotted path.
logger = logging.getLogger(__name__)

# Router collecting every scraper/extraction endpoint under one OpenAPI tag.
router = APIRouter(tags=["compliance-scraper"])


# ============================================================================
# Regulation Scraper
# ============================================================================
|
|
|
|
@router.get("/scraper/status")
|
|
async def get_scraper_status(db: Session = Depends(get_db)):
|
|
"""Get current scraper status."""
|
|
from ..services.regulation_scraper import RegulationScraperService
|
|
|
|
scraper = RegulationScraperService(db)
|
|
return await scraper.get_status()
|
|
|
|
|
|
@router.get("/scraper/sources")
|
|
async def get_scraper_sources(db: Session = Depends(get_db)):
|
|
"""Get list of known regulation sources."""
|
|
from ..services.regulation_scraper import RegulationScraperService
|
|
|
|
scraper = RegulationScraperService(db)
|
|
return {
|
|
"sources": scraper.get_known_sources(),
|
|
"total": len(scraper.KNOWN_SOURCES),
|
|
}
|
|
|
|
|
|
@router.post("/scraper/scrape-all")
|
|
async def scrape_all_sources(
|
|
background_tasks: BackgroundTasks,
|
|
db: Session = Depends(get_db),
|
|
):
|
|
"""Start scraping all known regulation sources."""
|
|
from ..services.regulation_scraper import RegulationScraperService
|
|
|
|
scraper = RegulationScraperService(db)
|
|
results = await scraper.scrape_all()
|
|
return {
|
|
"status": "completed",
|
|
"results": results,
|
|
}
|
|
|
|
|
|
@router.post("/scraper/scrape/{code}")
|
|
async def scrape_single_source(
|
|
code: str,
|
|
force: bool = Query(False, description="Force re-scrape even if data exists"),
|
|
db: Session = Depends(get_db),
|
|
):
|
|
"""Scrape a specific regulation source."""
|
|
from ..services.regulation_scraper import RegulationScraperService
|
|
|
|
scraper = RegulationScraperService(db)
|
|
|
|
try:
|
|
result = await scraper.scrape_single(code, force=force)
|
|
return result
|
|
except ValueError as e:
|
|
raise HTTPException(status_code=400, detail=str(e))
|
|
except Exception as e:
|
|
logger.error(f"Scraping {code} failed: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
@router.post("/scraper/extract-bsi")
|
|
async def extract_bsi_requirements(
|
|
code: str = Query("BSI-TR-03161-2", description="BSI TR code"),
|
|
force: bool = Query(False),
|
|
db: Session = Depends(get_db),
|
|
):
|
|
"""
|
|
Extract requirements from BSI Technical Guidelines.
|
|
|
|
Uses pre-defined Pruefaspekte from BSI-TR-03161 documents.
|
|
"""
|
|
from ..services.regulation_scraper import RegulationScraperService
|
|
|
|
if not code.startswith("BSI"):
|
|
raise HTTPException(status_code=400, detail="Only BSI codes are supported")
|
|
|
|
scraper = RegulationScraperService(db)
|
|
|
|
try:
|
|
result = await scraper.scrape_single(code, force=force)
|
|
return result
|
|
except ValueError as e:
|
|
raise HTTPException(status_code=400, detail=str(e))
|
|
except Exception as e:
|
|
logger.error(f"BSI extraction failed: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
@router.post("/scraper/extract-pdf", response_model=PDFExtractionResponse)
|
|
async def extract_pdf_requirements(
|
|
request: PDFExtractionRequest,
|
|
db: Session = Depends(get_db),
|
|
):
|
|
"""
|
|
Extract Pruefaspekte from BSI-TR PDF documents using PyMuPDF.
|
|
|
|
Supported documents:
|
|
- BSI-TR-03161-1: General security requirements
|
|
- BSI-TR-03161-2: Web application security (OAuth, Sessions, etc.)
|
|
- BSI-TR-03161-3: Backend/server security
|
|
"""
|
|
from ..services.pdf_extractor import BSIPDFExtractor
|
|
from ..db.models import RegulationTypeEnum
|
|
|
|
# Map document codes to file paths
|
|
PDF_PATHS = {
|
|
"BSI-TR-03161-1": "/app/docs/BSI-TR-03161-1.pdf",
|
|
"BSI-TR-03161-2": "/app/docs/BSI-TR-03161-2.pdf",
|
|
"BSI-TR-03161-3": "/app/docs/BSI-TR-03161-3.pdf",
|
|
}
|
|
|
|
# Local development paths (fallback)
|
|
LOCAL_PDF_PATHS = {
|
|
"BSI-TR-03161-1": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-1.pdf",
|
|
"BSI-TR-03161-2": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-2.pdf",
|
|
"BSI-TR-03161-3": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-3.pdf",
|
|
}
|
|
|
|
doc_code = request.document_code.upper()
|
|
if doc_code not in PDF_PATHS:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Unsupported document: {doc_code}. Supported: {list(PDF_PATHS.keys())}"
|
|
)
|
|
|
|
# Try container path first, then local path
|
|
pdf_path = PDF_PATHS[doc_code]
|
|
if not os.path.exists(pdf_path):
|
|
pdf_path = LOCAL_PDF_PATHS.get(doc_code)
|
|
if not pdf_path or not os.path.exists(pdf_path):
|
|
raise HTTPException(
|
|
status_code=404,
|
|
detail=f"PDF file not found for {doc_code}"
|
|
)
|
|
|
|
try:
|
|
extractor = BSIPDFExtractor()
|
|
aspects = extractor.extract_from_file(pdf_path, source_name=doc_code)
|
|
stats = extractor.get_statistics(aspects)
|
|
|
|
# Convert to response format
|
|
aspect_responses = [
|
|
BSIAspectResponse(
|
|
aspect_id=a.aspect_id,
|
|
title=a.title,
|
|
full_text=a.full_text[:2000],
|
|
category=a.category.value,
|
|
page_number=a.page_number,
|
|
section=a.section,
|
|
requirement_level=a.requirement_level.value,
|
|
source_document=a.source_document,
|
|
keywords=a.keywords,
|
|
related_aspects=a.related_aspects,
|
|
)
|
|
for a in aspects
|
|
]
|
|
|
|
requirements_created = 0
|
|
|
|
# Save to database if requested
|
|
if request.save_to_db:
|
|
# Get or create regulation
|
|
reg_repo = RegulationRepository(db)
|
|
regulation = reg_repo.get_by_code(doc_code)
|
|
|
|
if not regulation:
|
|
regulation = reg_repo.create(
|
|
code=doc_code,
|
|
name=f"BSI TR {doc_code.split('-')[-1]}",
|
|
full_name=f"BSI Technische Richtlinie {doc_code}",
|
|
regulation_type=RegulationTypeEnum.BSI_STANDARD,
|
|
local_pdf_path=pdf_path,
|
|
)
|
|
|
|
# Create requirements from extracted aspects
|
|
req_repo = RequirementRepository(db)
|
|
existing_articles = {r.article for r in req_repo.get_by_regulation(regulation.id)}
|
|
|
|
for aspect in aspects:
|
|
if aspect.aspect_id not in existing_articles or request.force:
|
|
# Delete existing if force
|
|
if request.force and aspect.aspect_id in existing_articles:
|
|
existing = db.query(RequirementDB).filter(
|
|
RequirementDB.regulation_id == regulation.id,
|
|
RequirementDB.article == aspect.aspect_id
|
|
).first()
|
|
if existing:
|
|
db.delete(existing)
|
|
|
|
# Determine priority based on requirement level
|
|
priority_map = {"MUSS": 3, "SOLL": 2, "KANN": 1, "DARF NICHT": 3}
|
|
priority = priority_map.get(aspect.requirement_level.value, 2)
|
|
|
|
requirement = RequirementDB(
|
|
id=str(uuid.uuid4()),
|
|
regulation_id=regulation.id,
|
|
article=aspect.aspect_id,
|
|
paragraph=aspect.section,
|
|
title=aspect.title[:300],
|
|
description=f"Kategorie: {aspect.category.value}",
|
|
requirement_text=aspect.full_text[:4000],
|
|
is_applicable=True,
|
|
priority=priority,
|
|
source_page=aspect.page_number,
|
|
source_section=aspect.section,
|
|
)
|
|
db.add(requirement)
|
|
requirements_created += 1
|
|
|
|
db.commit()
|
|
|
|
return PDFExtractionResponse(
|
|
success=True,
|
|
source_document=doc_code,
|
|
total_aspects=len(aspects),
|
|
aspects=aspect_responses,
|
|
statistics=stats,
|
|
requirements_created=requirements_created,
|
|
)
|
|
|
|
except ImportError as e:
|
|
raise HTTPException(
|
|
status_code=500,
|
|
detail=f"PyMuPDF not installed: {e}. Run: pip install PyMuPDF"
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"PDF extraction failed for {doc_code}: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
@router.get("/scraper/pdf-documents")
|
|
async def list_pdf_documents():
|
|
"""List available PDF documents for extraction."""
|
|
PDF_DOCS = [
|
|
{
|
|
"code": "BSI-TR-03161-1",
|
|
"name": "BSI TR 03161 Teil 1",
|
|
"description": "Allgemeine Sicherheitsanforderungen für mobile Anwendungen",
|
|
"expected_aspects": "~30",
|
|
},
|
|
{
|
|
"code": "BSI-TR-03161-2",
|
|
"name": "BSI TR 03161 Teil 2",
|
|
"description": "Web-Anwendungssicherheit (OAuth, Sessions, Input Validation, etc.)",
|
|
"expected_aspects": "~80-100",
|
|
},
|
|
{
|
|
"code": "BSI-TR-03161-3",
|
|
"name": "BSI TR 03161 Teil 3",
|
|
"description": "Backend/Server-Sicherheit",
|
|
"expected_aspects": "~40",
|
|
},
|
|
]
|
|
|
|
# Check which PDFs exist
|
|
for doc in PDF_DOCS:
|
|
local_path = f"/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/{doc['code']}.pdf"
|
|
container_path = f"/app/docs/{doc['code']}.pdf"
|
|
doc["available"] = os.path.exists(local_path) or os.path.exists(container_path)
|
|
|
|
return {
|
|
"documents": PDF_DOCS,
|
|
"total": len(PDF_DOCS),
|
|
}
|