fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
296 lines added: backend/compliance/api/scraper_routes.py (new file)
@@ -0,0 +1,296 @@
|
||||
"""
|
||||
FastAPI routes for Regulation Scraper and PDF extraction.
|
||||
|
||||
Endpoints:
|
||||
- /scraper/status: Scraper status
|
||||
- /scraper/sources: Available sources
|
||||
- /scraper/scrape-all: Scrape all sources
|
||||
- /scraper/scrape/{code}: Scrape single source
|
||||
- /scraper/extract-bsi: Extract BSI requirements
|
||||
- /scraper/extract-pdf: Extract from PDF
|
||||
- /scraper/pdf-documents: List available PDFs
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import uuid
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query, BackgroundTasks
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from classroom_engine.database import get_db
|
||||
|
||||
from ..db import RegulationRepository, RequirementRepository
|
||||
from ..db.models import RequirementDB
|
||||
from .schemas import BSIAspectResponse, PDFExtractionResponse, PDFExtractionRequest
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(tags=["compliance-scraper"])
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Regulation Scraper
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/scraper/status")
|
||||
async def get_scraper_status(db: Session = Depends(get_db)):
|
||||
"""Get current scraper status."""
|
||||
from ..services.regulation_scraper import RegulationScraperService
|
||||
|
||||
scraper = RegulationScraperService(db)
|
||||
return await scraper.get_status()
|
||||
|
||||
|
||||
@router.get("/scraper/sources")
|
||||
async def get_scraper_sources(db: Session = Depends(get_db)):
|
||||
"""Get list of known regulation sources."""
|
||||
from ..services.regulation_scraper import RegulationScraperService
|
||||
|
||||
scraper = RegulationScraperService(db)
|
||||
return {
|
||||
"sources": scraper.get_known_sources(),
|
||||
"total": len(scraper.KNOWN_SOURCES),
|
||||
}
|
||||
|
||||
|
||||
@router.post("/scraper/scrape-all")
|
||||
async def scrape_all_sources(
|
||||
background_tasks: BackgroundTasks,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""Start scraping all known regulation sources."""
|
||||
from ..services.regulation_scraper import RegulationScraperService
|
||||
|
||||
scraper = RegulationScraperService(db)
|
||||
results = await scraper.scrape_all()
|
||||
return {
|
||||
"status": "completed",
|
||||
"results": results,
|
||||
}
|
||||
|
||||
|
||||
@router.post("/scraper/scrape/{code}")
|
||||
async def scrape_single_source(
|
||||
code: str,
|
||||
force: bool = Query(False, description="Force re-scrape even if data exists"),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""Scrape a specific regulation source."""
|
||||
from ..services.regulation_scraper import RegulationScraperService
|
||||
|
||||
scraper = RegulationScraperService(db)
|
||||
|
||||
try:
|
||||
result = await scraper.scrape_single(code, force=force)
|
||||
return result
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
except Exception as e:
|
||||
logger.error(f"Scraping {code} failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/scraper/extract-bsi")
|
||||
async def extract_bsi_requirements(
|
||||
code: str = Query("BSI-TR-03161-2", description="BSI TR code"),
|
||||
force: bool = Query(False),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""
|
||||
Extract requirements from BSI Technical Guidelines.
|
||||
|
||||
Uses pre-defined Pruefaspekte from BSI-TR-03161 documents.
|
||||
"""
|
||||
from ..services.regulation_scraper import RegulationScraperService
|
||||
|
||||
if not code.startswith("BSI"):
|
||||
raise HTTPException(status_code=400, detail="Only BSI codes are supported")
|
||||
|
||||
scraper = RegulationScraperService(db)
|
||||
|
||||
try:
|
||||
result = await scraper.scrape_single(code, force=force)
|
||||
return result
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
except Exception as e:
|
||||
logger.error(f"BSI extraction failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/scraper/extract-pdf", response_model=PDFExtractionResponse)
|
||||
async def extract_pdf_requirements(
|
||||
request: PDFExtractionRequest,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""
|
||||
Extract Pruefaspekte from BSI-TR PDF documents using PyMuPDF.
|
||||
|
||||
Supported documents:
|
||||
- BSI-TR-03161-1: General security requirements
|
||||
- BSI-TR-03161-2: Web application security (OAuth, Sessions, etc.)
|
||||
- BSI-TR-03161-3: Backend/server security
|
||||
"""
|
||||
from ..services.pdf_extractor import BSIPDFExtractor
|
||||
from ..db.models import RegulationTypeEnum
|
||||
|
||||
# Map document codes to file paths
|
||||
PDF_PATHS = {
|
||||
"BSI-TR-03161-1": "/app/docs/BSI-TR-03161-1.pdf",
|
||||
"BSI-TR-03161-2": "/app/docs/BSI-TR-03161-2.pdf",
|
||||
"BSI-TR-03161-3": "/app/docs/BSI-TR-03161-3.pdf",
|
||||
}
|
||||
|
||||
# Local development paths (fallback)
|
||||
LOCAL_PDF_PATHS = {
|
||||
"BSI-TR-03161-1": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-1.pdf",
|
||||
"BSI-TR-03161-2": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-2.pdf",
|
||||
"BSI-TR-03161-3": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-3.pdf",
|
||||
}
|
||||
|
||||
doc_code = request.document_code.upper()
|
||||
if doc_code not in PDF_PATHS:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unsupported document: {doc_code}. Supported: {list(PDF_PATHS.keys())}"
|
||||
)
|
||||
|
||||
# Try container path first, then local path
|
||||
pdf_path = PDF_PATHS[doc_code]
|
||||
if not os.path.exists(pdf_path):
|
||||
pdf_path = LOCAL_PDF_PATHS.get(doc_code)
|
||||
if not pdf_path or not os.path.exists(pdf_path):
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail=f"PDF file not found for {doc_code}"
|
||||
)
|
||||
|
||||
try:
|
||||
extractor = BSIPDFExtractor()
|
||||
aspects = extractor.extract_from_file(pdf_path, source_name=doc_code)
|
||||
stats = extractor.get_statistics(aspects)
|
||||
|
||||
# Convert to response format
|
||||
aspect_responses = [
|
||||
BSIAspectResponse(
|
||||
aspect_id=a.aspect_id,
|
||||
title=a.title,
|
||||
full_text=a.full_text[:2000],
|
||||
category=a.category.value,
|
||||
page_number=a.page_number,
|
||||
section=a.section,
|
||||
requirement_level=a.requirement_level.value,
|
||||
source_document=a.source_document,
|
||||
keywords=a.keywords,
|
||||
related_aspects=a.related_aspects,
|
||||
)
|
||||
for a in aspects
|
||||
]
|
||||
|
||||
requirements_created = 0
|
||||
|
||||
# Save to database if requested
|
||||
if request.save_to_db:
|
||||
# Get or create regulation
|
||||
reg_repo = RegulationRepository(db)
|
||||
regulation = reg_repo.get_by_code(doc_code)
|
||||
|
||||
if not regulation:
|
||||
regulation = reg_repo.create(
|
||||
code=doc_code,
|
||||
name=f"BSI TR {doc_code.split('-')[-1]}",
|
||||
full_name=f"BSI Technische Richtlinie {doc_code}",
|
||||
regulation_type=RegulationTypeEnum.BSI_STANDARD,
|
||||
local_pdf_path=pdf_path,
|
||||
)
|
||||
|
||||
# Create requirements from extracted aspects
|
||||
req_repo = RequirementRepository(db)
|
||||
existing_articles = {r.article for r in req_repo.get_by_regulation(regulation.id)}
|
||||
|
||||
for aspect in aspects:
|
||||
if aspect.aspect_id not in existing_articles or request.force:
|
||||
# Delete existing if force
|
||||
if request.force and aspect.aspect_id in existing_articles:
|
||||
existing = db.query(RequirementDB).filter(
|
||||
RequirementDB.regulation_id == regulation.id,
|
||||
RequirementDB.article == aspect.aspect_id
|
||||
).first()
|
||||
if existing:
|
||||
db.delete(existing)
|
||||
|
||||
# Determine priority based on requirement level
|
||||
priority_map = {"MUSS": 3, "SOLL": 2, "KANN": 1, "DARF NICHT": 3}
|
||||
priority = priority_map.get(aspect.requirement_level.value, 2)
|
||||
|
||||
requirement = RequirementDB(
|
||||
id=str(uuid.uuid4()),
|
||||
regulation_id=regulation.id,
|
||||
article=aspect.aspect_id,
|
||||
paragraph=aspect.section,
|
||||
title=aspect.title[:300],
|
||||
description=f"Kategorie: {aspect.category.value}",
|
||||
requirement_text=aspect.full_text[:4000],
|
||||
is_applicable=True,
|
||||
priority=priority,
|
||||
source_page=aspect.page_number,
|
||||
source_section=aspect.section,
|
||||
)
|
||||
db.add(requirement)
|
||||
requirements_created += 1
|
||||
|
||||
db.commit()
|
||||
|
||||
return PDFExtractionResponse(
|
||||
success=True,
|
||||
source_document=doc_code,
|
||||
total_aspects=len(aspects),
|
||||
aspects=aspect_responses,
|
||||
statistics=stats,
|
||||
requirements_created=requirements_created,
|
||||
)
|
||||
|
||||
except ImportError as e:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"PyMuPDF not installed: {e}. Run: pip install PyMuPDF"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"PDF extraction failed for {doc_code}: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/scraper/pdf-documents")
|
||||
async def list_pdf_documents():
|
||||
"""List available PDF documents for extraction."""
|
||||
PDF_DOCS = [
|
||||
{
|
||||
"code": "BSI-TR-03161-1",
|
||||
"name": "BSI TR 03161 Teil 1",
|
||||
"description": "Allgemeine Sicherheitsanforderungen für mobile Anwendungen",
|
||||
"expected_aspects": "~30",
|
||||
},
|
||||
{
|
||||
"code": "BSI-TR-03161-2",
|
||||
"name": "BSI TR 03161 Teil 2",
|
||||
"description": "Web-Anwendungssicherheit (OAuth, Sessions, Input Validation, etc.)",
|
||||
"expected_aspects": "~80-100",
|
||||
},
|
||||
{
|
||||
"code": "BSI-TR-03161-3",
|
||||
"name": "BSI TR 03161 Teil 3",
|
||||
"description": "Backend/Server-Sicherheit",
|
||||
"expected_aspects": "~40",
|
||||
},
|
||||
]
|
||||
|
||||
# Check which PDFs exist
|
||||
for doc in PDF_DOCS:
|
||||
local_path = f"/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/{doc['code']}.pdf"
|
||||
container_path = f"/app/docs/{doc['code']}.pdf"
|
||||
doc["available"] = os.path.exists(local_path) or os.path.exists(container_path)
|
||||
|
||||
return {
|
||||
"documents": PDF_DOCS,
|
||||
"total": len(PDF_DOCS),
|
||||
}
|
||||
Reference in New Issue
Block a user