Files
breakpilot-compliance/backend-compliance/compliance/api/scraper_routes.py
Benjamin Boenisch 4435e7ea0a Initial commit: breakpilot-compliance - Compliance SDK Platform
Services: Admin-Compliance, Backend-Compliance,
AI-Compliance-SDK, Consent-SDK, Developer-Portal,
PCA-Platform, DSMS

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:28 +01:00

297 lines
10 KiB
Python

"""
FastAPI routes for Regulation Scraper and PDF extraction.
Endpoints:
- /scraper/status: Scraper status
- /scraper/sources: Available sources
- /scraper/scrape-all: Scrape all sources
- /scraper/scrape/{code}: Scrape single source
- /scraper/extract-bsi: Extract BSI requirements
- /scraper/extract-pdf: Extract from PDF
- /scraper/pdf-documents: List available PDFs
"""
import logging
import os
import uuid
from fastapi import APIRouter, Depends, HTTPException, Query, BackgroundTasks
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
from ..db import RegulationRepository, RequirementRepository
from ..db.models import RequirementDB
from .schemas import BSIAspectResponse, PDFExtractionResponse, PDFExtractionRequest
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Router that collects every endpoint defined below; tagged "compliance-scraper".
router = APIRouter(tags=["compliance-scraper"])
# ============================================================================
# Regulation Scraper
# ============================================================================
@router.get("/scraper/status")
async def get_scraper_status(db: Session = Depends(get_db)):
    """Return whatever status the regulation scraper service reports.

    A ``RegulationScraperService`` is built around the injected session and
    the result of its ``get_status()`` coroutine is passed through unchanged.
    """
    # The service module is imported lazily, at call time.
    from ..services.regulation_scraper import RegulationScraperService

    current_status = await RegulationScraperService(db).get_status()
    return current_status
@router.get("/scraper/sources")
async def get_scraper_sources(db: Session = Depends(get_db)):
    """List the regulation sources known to the scraper service.

    Returns:
        A dict with ``sources`` (the value of ``get_known_sources()``) and
        ``total`` (the length of the service's ``KNOWN_SOURCES`` attribute).
    """
    from ..services.regulation_scraper import RegulationScraperService

    service = RegulationScraperService(db)
    known_sources = service.get_known_sources()
    source_count = len(service.KNOWN_SOURCES)
    return {"sources": known_sources, "total": source_count}
@router.post("/scraper/scrape-all")
async def scrape_all_sources(
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
):
    """Scrape every known regulation source and report the outcome.

    ``scrape_all()`` is awaited inside the request, so the response is only
    produced once it has returned. ``background_tasks`` is accepted as part
    of the signature but is not used by this handler.

    Returns:
        A dict with ``status`` set to ``"completed"`` and ``results`` holding
        the value returned by ``scrape_all()``.
    """
    from ..services.regulation_scraper import RegulationScraperService

    service = RegulationScraperService(db)
    outcome = await service.scrape_all()
    return {"status": "completed", "results": outcome}
@router.post("/scraper/scrape/{code}")
async def scrape_single_source(
    code: str,
    force: bool = Query(False, description="Force re-scrape even if data exists"),
    db: Session = Depends(get_db),
):
    """Scrape a specific regulation source.

    Args:
        code: Source code taken from the URL path and handed to the scraper.
        force: Forwarded to ``scrape_single`` as its ``force`` keyword.
        db: Database session injected by FastAPI.

    Returns:
        Whatever ``RegulationScraperService.scrape_single`` returns.

    Raises:
        HTTPException: 400 when the scraper raises ``ValueError``; 500 for
            any other failure. An ``HTTPException`` raised further down is
            passed through unchanged.
    """
    from ..services.regulation_scraper import RegulationScraperService

    scraper = RegulationScraperService(db)
    try:
        return await scraper.scrape_single(code, force=force)
    except HTTPException:
        # Already an HTTP error: do not let the generic handler below
        # rewrite its status code to 500.
        raise
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception("Scraping %s failed: %s", code, e)
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.post("/scraper/extract-bsi")
async def extract_bsi_requirements(
    code: str = Query("BSI-TR-03161-2", description="BSI TR code"),
    force: bool = Query(False),
    db: Session = Depends(get_db),
):
    """
    Extract requirements from BSI Technical Guidelines.

    Uses pre-defined Pruefaspekte from BSI-TR-03161 documents.

    Args:
        code: BSI TR code; must start with ``"BSI"``.
        force: Forwarded to ``scrape_single`` as its ``force`` keyword.
        db: Database session injected by FastAPI.

    Returns:
        Whatever ``RegulationScraperService.scrape_single`` returns.

    Raises:
        HTTPException: 400 when ``code`` does not start with ``"BSI"`` or the
            scraper raises ``ValueError``; 500 for any other failure. An
            ``HTTPException`` raised further down is passed through unchanged.
    """
    from ..services.regulation_scraper import RegulationScraperService

    if not code.startswith("BSI"):
        raise HTTPException(status_code=400, detail="Only BSI codes are supported")

    scraper = RegulationScraperService(db)
    try:
        return await scraper.scrape_single(code, force=force)
    except HTTPException:
        # Already an HTTP error: do not let the generic handler below
        # rewrite its status code to 500.
        raise
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception("BSI extraction failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.post("/scraper/extract-pdf", response_model=PDFExtractionResponse)
async def extract_pdf_requirements(
    request: PDFExtractionRequest,
    db: Session = Depends(get_db),
):
    """
    Extract Pruefaspekte from BSI-TR PDF documents using PyMuPDF.

    Supported documents:
    - BSI-TR-03161-1: General security requirements
    - BSI-TR-03161-2: Web application security (OAuth, Sessions, etc.)
    - BSI-TR-03161-3: Backend/server security

    Args:
        request: Carries ``document_code`` (upper-cased before lookup),
            ``save_to_db`` (persist the aspects as requirements) and
            ``force`` (replace a requirement already stored for an aspect).
        db: Database session injected by FastAPI.

    Returns:
        A ``PDFExtractionResponse`` with the extracted aspects (full text
        truncated to 2000 characters), the extractor's statistics and the
        number of requirement rows added.

    Raises:
        HTTPException: 400 for an unsupported document code, 404 when the
            PDF exists at neither known location, 500 when extraction or
            persistence fails.
    """
    from ..services.pdf_extractor import BSIPDFExtractor
    from ..db.models import RegulationTypeEnum

    # Map document codes to file paths
    PDF_PATHS = {
        "BSI-TR-03161-1": "/app/docs/BSI-TR-03161-1.pdf",
        "BSI-TR-03161-2": "/app/docs/BSI-TR-03161-2.pdf",
        "BSI-TR-03161-3": "/app/docs/BSI-TR-03161-3.pdf",
    }
    # Local development paths (fallback)
    LOCAL_PDF_PATHS = {
        "BSI-TR-03161-1": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-1.pdf",
        "BSI-TR-03161-2": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-2.pdf",
        "BSI-TR-03161-3": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-3.pdf",
    }

    doc_code = request.document_code.upper()
    if doc_code not in PDF_PATHS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported document: {doc_code}. Supported: {list(PDF_PATHS.keys())}"
        )

    # Try container path first, then local path
    pdf_path = PDF_PATHS[doc_code]
    if not os.path.exists(pdf_path):
        pdf_path = LOCAL_PDF_PATHS.get(doc_code)
    if not pdf_path or not os.path.exists(pdf_path):
        raise HTTPException(
            status_code=404,
            detail=f"PDF file not found for {doc_code}"
        )

    # Requirement level -> stored priority; levels not listed default to 2.
    # Built once here rather than once per aspect inside the loop.
    priority_map = {"MUSS": 3, "SOLL": 2, "KANN": 1, "DARF NICHT": 3}

    try:
        extractor = BSIPDFExtractor()
        aspects = extractor.extract_from_file(pdf_path, source_name=doc_code)
        stats = extractor.get_statistics(aspects)

        # Convert to response format
        aspect_responses = [
            BSIAspectResponse(
                aspect_id=a.aspect_id,
                title=a.title,
                full_text=a.full_text[:2000],
                category=a.category.value,
                page_number=a.page_number,
                section=a.section,
                requirement_level=a.requirement_level.value,
                source_document=a.source_document,
                keywords=a.keywords,
                related_aspects=a.related_aspects,
            )
            for a in aspects
        ]

        requirements_created = 0

        # Save to database if requested
        if request.save_to_db:
            # Get or create regulation
            reg_repo = RegulationRepository(db)
            regulation = reg_repo.get_by_code(doc_code)
            if not regulation:
                regulation = reg_repo.create(
                    code=doc_code,
                    name=f"BSI TR {doc_code.split('-')[-1]}",
                    full_name=f"BSI Technische Richtlinie {doc_code}",
                    regulation_type=RegulationTypeEnum.BSI_STANDARD,
                    local_pdf_path=pdf_path,
                )

            # Create requirements from extracted aspects
            req_repo = RequirementRepository(db)
            existing_articles = {
                r.article for r in req_repo.get_by_regulation(regulation.id)
            }

            for aspect in aspects:
                already_stored = aspect.aspect_id in existing_articles
                if already_stored and not request.force:
                    # Keep the stored requirement unless a refresh was forced.
                    continue

                if already_stored:
                    # Forced refresh: drop the stored row before re-adding it.
                    existing = db.query(RequirementDB).filter(
                        RequirementDB.regulation_id == regulation.id,
                        RequirementDB.article == aspect.aspect_id
                    ).first()
                    if existing:
                        db.delete(existing)

                # Determine priority based on requirement level
                priority = priority_map.get(aspect.requirement_level.value, 2)

                requirement = RequirementDB(
                    id=str(uuid.uuid4()),
                    regulation_id=regulation.id,
                    article=aspect.aspect_id,
                    paragraph=aspect.section,
                    title=aspect.title[:300],
                    description=f"Kategorie: {aspect.category.value}",
                    requirement_text=aspect.full_text[:4000],
                    is_applicable=True,
                    priority=priority,
                    source_page=aspect.page_number,
                    source_section=aspect.section,
                )
                db.add(requirement)
                requirements_created += 1

            db.commit()

        return PDFExtractionResponse(
            success=True,
            source_document=doc_code,
            total_aspects=len(aspects),
            aspects=aspect_responses,
            statistics=stats,
            requirements_created=requirements_created,
        )
    except ImportError as e:
        raise HTTPException(
            status_code=500,
            detail=f"PyMuPDF not installed: {e}. Run: pip install PyMuPDF"
        ) from e
    except Exception as e:
        # Discard any half-applied deletes/inserts so the injected session is
        # not left holding a failed or partially populated transaction.
        db.rollback()
        logger.exception("PDF extraction failed for %s: %s", doc_code, e)
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.get("/scraper/pdf-documents")
async def list_pdf_documents():
    """Describe the BSI PDFs offered for extraction and flag which are on disk.

    Returns:
        A dict with ``documents`` (one entry per PDF holding ``code``,
        ``name``, ``description``, ``expected_aspects`` and ``available``)
        and ``total`` (the number of entries).
    """
    # (code, name, description, expected_aspects) for each known document.
    catalogue = (
        (
            "BSI-TR-03161-1",
            "BSI TR 03161 Teil 1",
            "Allgemeine Sicherheitsanforderungen für mobile Anwendungen",
            "~30",
        ),
        (
            "BSI-TR-03161-2",
            "BSI TR 03161 Teil 2",
            "Web-Anwendungssicherheit (OAuth, Sessions, Input Validation, etc.)",
            "~80-100",
        ),
        (
            "BSI-TR-03161-3",
            "BSI TR 03161 Teil 3",
            "Backend/Server-Sicherheit",
            "~40",
        ),
    )

    documents = []
    for doc_code, doc_name, summary, expected in catalogue:
        # A document counts as available if it exists at either location;
        # the local development path is probed before the container path.
        candidate_paths = (
            f"/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/{doc_code}.pdf",
            f"/app/docs/{doc_code}.pdf",
        )
        documents.append(
            {
                "code": doc_code,
                "name": doc_name,
                "description": summary,
                "expected_aspects": expected,
                "available": any(os.path.exists(p) for p in candidate_paths),
            }
        )

    return {"documents": documents, "total": len(documents)}