fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
296 lines added: backend/compliance/api/scraper_routes.py (new file)
@@ -0,0 +1,296 @@
|
||||
"""
|
||||
FastAPI routes for Regulation Scraper and PDF extraction.
|
||||
|
||||
Endpoints:
|
||||
- /scraper/status: Scraper status
|
||||
- /scraper/sources: Available sources
|
||||
- /scraper/scrape-all: Scrape all sources
|
||||
- /scraper/scrape/{code}: Scrape single source
|
||||
- /scraper/extract-bsi: Extract BSI requirements
|
||||
- /scraper/extract-pdf: Extract from PDF
|
||||
- /scraper/pdf-documents: List available PDFs
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import uuid
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query, BackgroundTasks
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from classroom_engine.database import get_db
|
||||
|
||||
from ..db import RegulationRepository, RequirementRepository
|
||||
from ..db.models import RequirementDB
|
||||
from .schemas import BSIAspectResponse, PDFExtractionResponse, PDFExtractionRequest
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(tags=["compliance-scraper"])
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Regulation Scraper
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/scraper/status")
|
||||
async def get_scraper_status(db: Session = Depends(get_db)):
|
||||
"""Get current scraper status."""
|
||||
from ..services.regulation_scraper import RegulationScraperService
|
||||
|
||||
scraper = RegulationScraperService(db)
|
||||
return await scraper.get_status()
|
||||
|
||||
|
||||
@router.get("/scraper/sources")
|
||||
async def get_scraper_sources(db: Session = Depends(get_db)):
|
||||
"""Get list of known regulation sources."""
|
||||
from ..services.regulation_scraper import RegulationScraperService
|
||||
|
||||
scraper = RegulationScraperService(db)
|
||||
return {
|
||||
"sources": scraper.get_known_sources(),
|
||||
"total": len(scraper.KNOWN_SOURCES),
|
||||
}
|
||||
|
||||
|
||||
@router.post("/scraper/scrape-all")
|
||||
async def scrape_all_sources(
|
||||
background_tasks: BackgroundTasks,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""Start scraping all known regulation sources."""
|
||||
from ..services.regulation_scraper import RegulationScraperService
|
||||
|
||||
scraper = RegulationScraperService(db)
|
||||
results = await scraper.scrape_all()
|
||||
return {
|
||||
"status": "completed",
|
||||
"results": results,
|
||||
}
|
||||
|
||||
|
||||
@router.post("/scraper/scrape/{code}")
|
||||
async def scrape_single_source(
|
||||
code: str,
|
||||
force: bool = Query(False, description="Force re-scrape even if data exists"),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""Scrape a specific regulation source."""
|
||||
from ..services.regulation_scraper import RegulationScraperService
|
||||
|
||||
scraper = RegulationScraperService(db)
|
||||
|
||||
try:
|
||||
result = await scraper.scrape_single(code, force=force)
|
||||
return result
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
except Exception as e:
|
||||
logger.error(f"Scraping {code} failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/scraper/extract-bsi")
|
||||
async def extract_bsi_requirements(
|
||||
code: str = Query("BSI-TR-03161-2", description="BSI TR code"),
|
||||
force: bool = Query(False),
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""
|
||||
Extract requirements from BSI Technical Guidelines.
|
||||
|
||||
Uses pre-defined Pruefaspekte from BSI-TR-03161 documents.
|
||||
"""
|
||||
from ..services.regulation_scraper import RegulationScraperService
|
||||
|
||||
if not code.startswith("BSI"):
|
||||
raise HTTPException(status_code=400, detail="Only BSI codes are supported")
|
||||
|
||||
scraper = RegulationScraperService(db)
|
||||
|
||||
try:
|
||||
result = await scraper.scrape_single(code, force=force)
|
||||
return result
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
except Exception as e:
|
||||
logger.error(f"BSI extraction failed: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/scraper/extract-pdf", response_model=PDFExtractionResponse)
|
||||
async def extract_pdf_requirements(
|
||||
request: PDFExtractionRequest,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""
|
||||
Extract Pruefaspekte from BSI-TR PDF documents using PyMuPDF.
|
||||
|
||||
Supported documents:
|
||||
- BSI-TR-03161-1: General security requirements
|
||||
- BSI-TR-03161-2: Web application security (OAuth, Sessions, etc.)
|
||||
- BSI-TR-03161-3: Backend/server security
|
||||
"""
|
||||
from ..services.pdf_extractor import BSIPDFExtractor
|
||||
from ..db.models import RegulationTypeEnum
|
||||
|
||||
# Map document codes to file paths
|
||||
PDF_PATHS = {
|
||||
"BSI-TR-03161-1": "/app/docs/BSI-TR-03161-1.pdf",
|
||||
"BSI-TR-03161-2": "/app/docs/BSI-TR-03161-2.pdf",
|
||||
"BSI-TR-03161-3": "/app/docs/BSI-TR-03161-3.pdf",
|
||||
}
|
||||
|
||||
# Local development paths (fallback)
|
||||
LOCAL_PDF_PATHS = {
|
||||
"BSI-TR-03161-1": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-1.pdf",
|
||||
"BSI-TR-03161-2": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-2.pdf",
|
||||
"BSI-TR-03161-3": "/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/BSI-TR-03161-3.pdf",
|
||||
}
|
||||
|
||||
doc_code = request.document_code.upper()
|
||||
if doc_code not in PDF_PATHS:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unsupported document: {doc_code}. Supported: {list(PDF_PATHS.keys())}"
|
||||
)
|
||||
|
||||
# Try container path first, then local path
|
||||
pdf_path = PDF_PATHS[doc_code]
|
||||
if not os.path.exists(pdf_path):
|
||||
pdf_path = LOCAL_PDF_PATHS.get(doc_code)
|
||||
if not pdf_path or not os.path.exists(pdf_path):
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail=f"PDF file not found for {doc_code}"
|
||||
)
|
||||
|
||||
try:
|
||||
extractor = BSIPDFExtractor()
|
||||
aspects = extractor.extract_from_file(pdf_path, source_name=doc_code)
|
||||
stats = extractor.get_statistics(aspects)
|
||||
|
||||
# Convert to response format
|
||||
aspect_responses = [
|
||||
BSIAspectResponse(
|
||||
aspect_id=a.aspect_id,
|
||||
title=a.title,
|
||||
full_text=a.full_text[:2000],
|
||||
category=a.category.value,
|
||||
page_number=a.page_number,
|
||||
section=a.section,
|
||||
requirement_level=a.requirement_level.value,
|
||||
source_document=a.source_document,
|
||||
keywords=a.keywords,
|
||||
related_aspects=a.related_aspects,
|
||||
)
|
||||
for a in aspects
|
||||
]
|
||||
|
||||
requirements_created = 0
|
||||
|
||||
# Save to database if requested
|
||||
if request.save_to_db:
|
||||
# Get or create regulation
|
||||
reg_repo = RegulationRepository(db)
|
||||
regulation = reg_repo.get_by_code(doc_code)
|
||||
|
||||
if not regulation:
|
||||
regulation = reg_repo.create(
|
||||
code=doc_code,
|
||||
name=f"BSI TR {doc_code.split('-')[-1]}",
|
||||
full_name=f"BSI Technische Richtlinie {doc_code}",
|
||||
regulation_type=RegulationTypeEnum.BSI_STANDARD,
|
||||
local_pdf_path=pdf_path,
|
||||
)
|
||||
|
||||
# Create requirements from extracted aspects
|
||||
req_repo = RequirementRepository(db)
|
||||
existing_articles = {r.article for r in req_repo.get_by_regulation(regulation.id)}
|
||||
|
||||
for aspect in aspects:
|
||||
if aspect.aspect_id not in existing_articles or request.force:
|
||||
# Delete existing if force
|
||||
if request.force and aspect.aspect_id in existing_articles:
|
||||
existing = db.query(RequirementDB).filter(
|
||||
RequirementDB.regulation_id == regulation.id,
|
||||
RequirementDB.article == aspect.aspect_id
|
||||
).first()
|
||||
if existing:
|
||||
db.delete(existing)
|
||||
|
||||
# Determine priority based on requirement level
|
||||
priority_map = {"MUSS": 3, "SOLL": 2, "KANN": 1, "DARF NICHT": 3}
|
||||
priority = priority_map.get(aspect.requirement_level.value, 2)
|
||||
|
||||
requirement = RequirementDB(
|
||||
id=str(uuid.uuid4()),
|
||||
regulation_id=regulation.id,
|
||||
article=aspect.aspect_id,
|
||||
paragraph=aspect.section,
|
||||
title=aspect.title[:300],
|
||||
description=f"Kategorie: {aspect.category.value}",
|
||||
requirement_text=aspect.full_text[:4000],
|
||||
is_applicable=True,
|
||||
priority=priority,
|
||||
source_page=aspect.page_number,
|
||||
source_section=aspect.section,
|
||||
)
|
||||
db.add(requirement)
|
||||
requirements_created += 1
|
||||
|
||||
db.commit()
|
||||
|
||||
return PDFExtractionResponse(
|
||||
success=True,
|
||||
source_document=doc_code,
|
||||
total_aspects=len(aspects),
|
||||
aspects=aspect_responses,
|
||||
statistics=stats,
|
||||
requirements_created=requirements_created,
|
||||
)
|
||||
|
||||
except ImportError as e:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"PyMuPDF not installed: {e}. Run: pip install PyMuPDF"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"PDF extraction failed for {doc_code}: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/scraper/pdf-documents")
|
||||
async def list_pdf_documents():
|
||||
"""List available PDF documents for extraction."""
|
||||
PDF_DOCS = [
|
||||
{
|
||||
"code": "BSI-TR-03161-1",
|
||||
"name": "BSI TR 03161 Teil 1",
|
||||
"description": "Allgemeine Sicherheitsanforderungen für mobile Anwendungen",
|
||||
"expected_aspects": "~30",
|
||||
},
|
||||
{
|
||||
"code": "BSI-TR-03161-2",
|
||||
"name": "BSI TR 03161 Teil 2",
|
||||
"description": "Web-Anwendungssicherheit (OAuth, Sessions, Input Validation, etc.)",
|
||||
"expected_aspects": "~80-100",
|
||||
},
|
||||
{
|
||||
"code": "BSI-TR-03161-3",
|
||||
"name": "BSI TR 03161 Teil 3",
|
||||
"description": "Backend/Server-Sicherheit",
|
||||
"expected_aspects": "~40",
|
||||
},
|
||||
]
|
||||
|
||||
# Check which PDFs exist
|
||||
for doc in PDF_DOCS:
|
||||
local_path = f"/Users/benjaminadmin/Projekte/breakpilot-pwa/docs/{doc['code']}.pdf"
|
||||
container_path = f"/app/docs/{doc['code']}.pdf"
|
||||
doc["available"] = os.path.exists(local_path) or os.path.exists(container_path)
|
||||
|
||||
return {
|
||||
"documents": PDF_DOCS,
|
||||
"total": len(PDF_DOCS),
|
||||
}
|
||||
Reference in New Issue
Block a user